{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Length of Captions in Dataset\n",
    "\n",
    "统计一下 caption 的长度, 看看选择 len=77 是否合理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(captions_len): 8176\n"
     ]
    }
   ],
   "source": [
    "input_filepath = \"/remote-home/share/medical/public/ROCO/test/radiology/processed_test.csv\"\n",
    "df = pd.read_csv(input_filepath, sep=',')\n",
    "captions = df[\"caption\"].tolist()\n",
    "captions_len = [len(caption.split(' ')) for caption in captions]\n",
    "print(f\"len(captions_len): {len(captions_len)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Draw Histogram for Caption Length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAazUlEQVR4nO3df9jldV3n8edLRDQhgRhoHNAhG9vAtlHvyC6rBTVBtmmgzRrX9SJjgza40iv3B5gbWE2xW2B2pbZjkHNtKs6qKFNUImFmmTAQIcPAOgnKOBMzKgTYRs7w3j/O5/5ymDn3PeeemXOf+8fzcV3nOud8vr/e5ztnzuv+fL7f8z2pKiRJAnjauAuQJM0dhoIkqWMoSJI6hoIkqWMoSJI6hoIkqWMoaNYl2Zzk9FnYzn9K8mCSx5J826i3N0Q9s/K6xyVJJfnOcdehg2Mo6JBKcn+SV+3V9tNJPjP5vKpOrapP7Wc9y9uHzNMPsI7DgauAV1fVkVX1tQHzPCPJ5Um+kOQbrfZrkiw/kG3ute73Jfm1/rZhXvcBbutTSf7joV7vXNumZoehoIXqBOCZwOZp5vkw8GPAvweeA3wvcBvwypFXJ81RhoJmXX9vIslpSTYleaQN9VzVZvt0u3+4Df/8wID1HJHkt5Nsb7ffbm0vBO7tW/7PByz7KuBHgNVVdWtV7a6qf6yqd1XV1W2eNybZkuTRJF9McmHf8qcn2ZbkrUm+2l7T69u0C4DXA/+11b5xwOseWPte635Lkp1JdiR54wHu659pr+GhJH+W5Pl90yrJz7We0kNJ3pUkbdphSa5sr+2+JBdP9tySrAV+CPjd9vp+t2+Trxq0Ps0fhoLG7Z3AO6vqW4EXABta+w+3+6Pb8M9nByz7S8DLgJX0/so/DXhbVf1f4NS+5V8xYNlXAbdU1QPT1LYT+FHgW4E3Au9I8pK+6d8OHAcsA84D1iX5rqpaB7wf+J+t9lXD1r7Xup/T1n0+8K4kx0xT6z6SnAO8FfhxYAnwl8AH95rtR4HvazX8JHBma/9Z4DWtvpcA50wuUFW/1NZ1cXt9Fw+xPs0ThoJG4WNJHp68Ae+eZt5vAt+Z5Liqeqyq/mYG23k98CtVtbOqdgFvB94w5LLfBuyYboaq+uOq+vvq+QvgE/T+Qu7336vq8Tb9j+l9EB6K2r/Zpn+zqm4AHgO+a8h1T7oQ+I2q2lJVu4FfB1b29xaAK6rq4ar6MnAzvRCgvY53VtW2qnoIuGLIbU61Ps0ThoJG4ZyqOnryBvz8NPOeD7wQuCfJrUl+dAbbeS7wpb7nX2ptw/gasHS6GZK8JsnfJPl6C7ez6fUMJj1UVd84wO3vr/avtQ/ySf8EHDnkuic9H3hnXzh/HQi93sekf5hiG88F+ntR0/Wo+k21Ps0ThoLGqqq+UFWvA44H/gfw4STPBoa5fO92eh98k57X2obxSeC0JCcOmtjG9z8C/BZwQgu3G+h9qE46ptU6aPv7q/9gah/WA8CF/QFdVc+qqr8eYtkdQP++OWmv6V5eeYEyFDRWSf5DkiVV9QTwcGveA+wCngC+Y5rFPwi8LcmSJMcBvwz84TDbrapPAjcC1yV5aTuAelQ78PozwDOAI1odu5O8Bnj1gFW9vZ3a+kP0xtP/T2t/cFS1T+HpSZ7Zdzsc+D3g0iSnAiR5TpLXDrm+DcCbkixLcjTw3/aavr/Xp3nKUNC4nQVsTvIYvYPOa6rqn6vqn4C1wF+14Y+XDVj214BNwJ3A54HbW9uwfoLeX/8fAv4RuAuYAD5ZVY8Cv0Dvw/EheqetXr/X8v/Qpm2nd2D556rqnjbtauCUVvvHRlD73t4D/L++2x9U1XX0el/XJnmkvb7XDLm+99I7hnIn8Lf09tNueoENvX+rn2hnGf3OQdStOSb+yI40c+l9M/kPq2rg8NNC03pKv1dVz9/vzJrX7ClI2keSZyU5uw2rLQMuA64bd10aPUNB0iCh
d5rsQ/SGj7bQO+6hBc7hI0lSx56CJKlzQFegnCuOO+64Wr58+bjLkKR55bbbbvtqVS0ZNG1eh8Ly5cvZtGnTuMuQpHklyZemmubwkSSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYyhIkjqGgiSpYygcoFWrVo27BEk65EYWCu0nAW9J8ndJNid5e2u/PMlXktzRbmf3LXNpkq1J7k1y5qhqO1AGgaSFbpTXPnoceEVVPdZ+L/YzSf6kTXtHVf1W/8xJTgHWAKcCzwU+meSFVbUHSdKsGFlPoXoea08Pb7fpfrxhNXBtVT1eVfcBW4HTRlWfJGlfIz2mkOSwJHcAO4Ebq+pzbdLFSe5Mck2SY1rbMuCBvsW3tba913lBkk1JNu3atWuU5UvSojPSUKiqPVW1EjgROC3Ji4D3AC8AVgI7gCvb7Bm0igHrXFdVE1U1sWTJwMuBH3KrVq3yeIKkRWFWzj6qqoeBTwFnVdWDLSyeAN7Lk0NE24CT+hY7Edg+G/VJknpGefbRkiRHt8fPAl4F3JNkad9s5wJ3tcfXA2uSHJHkZGAFcMuo6pMk7WuUZx8tBdYnOYxe+Gyoqj9K8r+TrKQ3NHQ/cCFAVW1OsgG4G9gNXOSZR5I0u0YWClV1J/DiAe1vmGaZtcDaUdUkSZqe32iWJHUMBUlSZ5THFBYkT02VtJDZU5AkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUDgEvPSFpIXCax8dBMNA0kJjT0GS1DEUJEkdQ0GS1DEUJEkdQ0GS1BlZKCR5ZpJbkvxdks1J3t7aj01yY5IvtPtj+pa5NMnWJPcmOXNUtUmSBhtlT+Fx4BVV9b3ASuCsJC8DLgFuqqoVwE3tOUlOAdYApwJnAe9OctgI6xuKp51KWkxGFgrV81h7eni7FbAaWN/a1wPntMergWur6vGqug/YCpw2qvokSfsa6TGFJIcluQPYCdxYVZ8DTqiqHQDt/vg2+zLggb7Ft7W2vdd5QZJNSTbt2rVrlOVL0qIz0m80V9UeYGWSo4HrkrxomtkzaBUD1rkOWAcwMTGxz/RDZabDRpPzb9y48SmPJWk+mZWzj6rqYeBT9I4VPJhkKUC739lm2wac1LfYicD22ahPktQzyrOPlrQeAkmeBbwKuAe4HjivzXYe8PH2+HpgTZIjkpwMrABuGVV9kqR9jXL4aCmwvp1B9DRgQ1X9UZLPAhuSnA98GXgtQFVtTrIBuBvYDVzUhp8kSbNkZKFQVXcCLx7Q/jXglVMssxZYO6qaJEnT8xvNkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKkqSOoSBJ6hgKIzTTn/SUpHEzFAbww1zSYmUoHGIGiqT5zFCQJHUMBUlSx1CQJHVGFgpJTkpyc5ItSTYneVNrvzzJV5Lc0W5n9y1zaZKtSe5NcuaoapMkDfb0Ea57N/CWqro9yVHAbUlubNPeUVW/1T9zklOANcCpwHOBTyZ5YVXtGWGNkqQ+I+spVNWOqrq9PX4U2AIsm2aR1cC1VfV4Vd0HbAVOG1V9kqR9zcoxhSTLgRcDn2tNFye5M8k1SY5pbcuAB/oW28aAEElyQZJNSTbt2rVrlGVL0qIz8lBIciTwEeDNVfUI8B7gBcBKYAdw5eSsAxavfRqq1lXVRFVNLFmyZDRFH0KrVq3yuwuS5o2RhkKSw+kFwvur6qMAVfVgVe2pqieA9/LkENE24KS+xU8Eto+yPknSU43y7KMAVwNbquqqvvalfbOdC9zVHl8PrElyRJKTgRXALaOqT5K0r1GeffRy4A3A55Pc0dreCrwuyUp6Q0P3AxcCVNXmJBuAu+mduXSRZx5J0uwaWShU1WcYfJzghmmWWQusHVVNkqTp+Y1mSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQyFWeQlLyTNdYaCJKkz
ym80zzv+FS9psbOnMEsMHEnzgaEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkzshCIclJSW5OsiXJ5iRvau3HJrkxyRfa/TF9y1yaZGuSe5OcOaraJEmDjbKnsBt4S1V9N/Ay4KIkpwCXADdV1QrgpvacNm0NcCpwFvDuJIeNsD5J0l6GCoUkJye5KslHk1w/eZtumaraUVW3t8ePAluAZcBqYH2bbT1wTnu8Gri2qh6vqvuArcBpM35F84gXyZM01wx76eyPAVcDG4EnZrqRJMuBFwOfA06oqh3QC44kx7fZlgF/07fYtta297ouAC4AeN7znjfTUiRJ0xg2FP65qn7nQDaQ5EjgI8Cbq+qRJFPOOqCt9mmoWgesA5iYmNhnuiTpwA0bCu9MchnwCeDxycbJ4aGpJDmcXiC8v6o+2pofTLK09RKWAjtb+zbgpL7FTwS2D1nfvDU5hLRx48YxVyJJw4fC9wBvAF7Bk8NH1Z4PlF6X4GpgS1Vd1TfpeuA84Ip2//G+9g8kuQp4LrACuGXI+uYVjyVImquGDYVzge+oqn+ZwbpfTi9IPp/kjtb2VnphsCHJ+cCXgdcCVNXmJBuAu+mduXRRVe2ZwfYkSQdp2FD4O+Bonhzq2a+q+gyDjxMAvHKKZdYCa4fdhiTp0Bo2FE4A7klyK089pvBjI6lKkjQWw4bCZSOtQpI0JwwVClX1F6MuRJI0fkOFQpJHefI7A88ADge+UVXfOqrCJEmzb9iewlH9z5OcwwK/BIUkLUYHdEG8qvoY03xHQZI0Pw07fPTjfU+fBkww4BIUkqT5bdizj/q/grsbuJ/eVU0lSQvIsMcU3jjqQiRJ4zdtKCT55WkmV1X96iGuR5I0RvvrKXxjQNuzgfOBbwMMBUlaQKYNhaq6cvJxkqOANwFvBK4FrpxqOUnS/LTfYwpJjgV+EXg9vZ/PfElVPTTqwiRJs29/xxR+E/hxer909j1V9disVCVJGov9fXntLfR+8OZtwPYkj7Tbo0keGX15kqTZtL9jCgf0jWdJ0vzkh37jT2RKkqEgSepjKEiSOoaCJKkzslBIck2SnUnu6mu7PMlXktzRbmf3Tbs0ydYk9yY5c1R1SZKmNsqewvuAswa0v6OqVrbbDQBJTgHWAKe2Zd6d5LAR1iZJGmBkoVBVnwa+PuTsq4Frq+rxqroP2Iq/7DY0z5ySdKiM45jCxUnubMNLx7S2ZcADffNsa237SHJBkk1JNu3atWvUtUrSojLbofAe4AXASmAHT15ULwPmHfjLblW1rqomqmpiyZIlIylyHPxrX9JcMKuhUFUPVtWeqnoCeC9PDhFtA07qm/VEYPts1iZJmuVQSLK07+m5wOSZSdcDa5IckeRkYAVwy2zWNpfYa5A0LsP+RvOMJfkgcDpwXJJtwGXA6UlW0hsauh+4EKCqNifZANxN7zegL6qqPaOqTZI02MhCoapeN6D56mnmXwusHVU9kqT98xvNkqSOoSBJ6hgKkqSOoSBJ6hgKc5ynp0qaTSM7+0gzZwBIGjd7CpKkzqLvKfjXuSQ9yZ6CJKljKEiSOoaCJKljKEiSOoaCJKmzqEPBM48k6akW/Smpc5WBJWkcFnVPQZL0VIaCJKljKMwDq1atcjhJ0qwwFCRJHUNBktQZWSgkuSbJziR39bUdm+TGJF9o98f0Tbs0ydYk9yY5c1R1SZKmNsqewvuAs/ZquwS4qapWADe15yQ5BVgDnNqWeXeSw0ZYmyRpgJGFQlV9Gvj6Xs2rgfXt8XrgnL72a6vq8aq6D9gKnDaq2iRJg832MYUTqmoHQLs/vrUvAx7om29ba9tHkguSbEqyadeuXSMtVpIWm7lyoDkD2mrQjFW1rqomqmpiyZIlIy5LkhaX2Q6FB5MsBWj3O1v7NuCkvvlOBLbPcm2StOjNdihcD5zXHp8HfLyvfU2SI5KcDKwAbpnl2iRp0RvZBfGSfBA4HTguyTbg
MuAKYEOS84EvA68FqKrNSTYAdwO7gYuqas+oapMkDTayUKiq100x6ZVTzL8WWDuqeiRJ+zdXDjRrCF7/SNKoGQrzlAEhaRQMBUlSx1CQJHX8Oc55xmEjSaNkT0GS1DEUJEkdQ0GS1PGYwjzm8QVJh5o9BUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMhQXG01QlHQxDQZLU8ctrC4Q9BEmHgj0FSVLHUFjA7D1ImqmxDB8luR94FNgD7K6qiSTHAh8ClgP3Az9ZVQ+Noz5JWqzG2VM4o6pWVtVEe34JcFNVrQBuas8lSbNoLg0frQbWt8frgXPGV4okLU7jCoUCPpHktiQXtLYTqmoHQLs/ftCCSS5IsinJpl27ds1SuZK0OIzrlNSXV9X2JMcDNya5Z9gFq2odsA5gYmKiRlXgQtF/sHnjxo1jrETSfDCWnkJVbW/3O4HrgNOAB5MsBWj3O8dR20K2atUqz0iSNK1Z7ykkeTbwtKp6tD1+NfArwPXAecAV7f7js13bQjHMB//kPPYeJPUbx/DRCcB1SSa3/4Gq+tMktwIbkpwPfBl47Rhqk6RFbdZDoaq+CHzvgPavAa+c7XokSU+aS6ekSpLGzFCQJHUMBUlSx1CQJHUMhUXO7y5I6mcoCPAy25J6DAUNZA9CWpwMhUVoqg97g0CSoSBJ6hgK2oe9BWnxMhQkSR1DQUPr70HYm5AWJkNB09r7w9+D0dLCZihIkjrj+jlOzSPTncIK/lCPtJDYU9AhMdWw0kyGmxyaksbPUNBB8wC0tHA4fKRDzmCQ5i97CpoVk0NDew8R7e+SG1PNa+9EGg17Cpq3PAAuHXqpqnHX8BRJzgLeCRwG/H5VXTHVvBMTE7Vp06YD3pZ/Yc5NGzduPOB/m72XHRQMq1atGjowZjKvNF8kua2qJgZNm1PDR0kOA94FvAY4BXhdklPGW5Vm28GEtWc6SQdnrg0fnQZsraovAiS5FlgN3D3WqjRv7e+Dv78nMMxw1DBBMtX6+nsc++vNTLX8THpBe887k2G16WofNM9M1jndvFOtc7plp+vNjXoocRxDlaPe5lwLhWXAA33PtwHf3z9DkguAC9rTx5LcO8NtHAd89YArXBwW1T5KcqDzDdxPU61vpu37m28m6+tvG3Z7w2xryOlP2U8Hsb8PeB8eyGueiUO0/hn9vzvIbT5/qglzLRQGvcqnHPSoqnXAugPeQLJpqrE09biPhuN+Go77aThzZT/NqWMK9HoGJ/U9PxHYPqZaJGnRmWuhcCuwIsnJSZ4BrAGuH3NNkrRozKnho6raneRi4M/onZJ6TVVtPsSbOeChp0XEfTQc99Nw3E/DmRP7ac59T0GSND5zbfhIkjRGhoIkqbNoQiHJWUnuTbI1ySXjrmcuSXJ/ks8nuSPJptZ2bJIbk3yh3R8z7jpnW5JrkuxMcldf25T7Jcml7f11b5Izx1P17JtiP12e5CvtPXVHkrP7pi26/ZTkpCQ3J9mSZHOSN7X2Ofd+WhSh4OUzhnJGVa3sO0/6EuCmqloB3NSeLzbvA87aq23gfmnvpzXAqW2Zd7f33WLwPvbdTwDvaO+plVV1Ayzq/bQbeEtVfTfwMuCiti/m3PtpUYQCfZfPqKp/ASYvn6GprQbWt8frgXPGV8p4VNWnga/v1TzVflkNXFtVj1fVfcBWeu+7BW+K/TSVRbmfqmpHVd3eHj8KbKF3BYc5935aLKEw6PIZy8ZUy1xUwCeS3NYuIwJwQlXtgN4bGjh+bNXNLVPtF99j+7o4yZ1teGlyWGTR76cky4EXA59jDr6fFkso7PfyGYvcy6vqJfSG1y5K8sPjLmge8j32VO8BXgCsBHYAV7b2Rb2fkhwJfAR4c1U9Mt2sA9pmZT8tllDw8hnTqKrt7X4ncB29buqDSZYCtPud46twTplqv/ge61NVD1bVnqp6AngvTw59LNr9lORweoHw/qr6aGuec++nxRIKXj5jCkmeneSoycfAq4G7
6O2f89ps5wEfH0+Fc85U++V6YE2SI5KcDKwAbhlDfXPC5Addcy699xQs0v2U3iVNrwa2VNVVfZPm3PtpTl3mYlRm6fIZ89UJwHXtMrxPBz5QVX+a5FZgQ5LzgS8Drx1jjWOR5IPA6cBxSbYBlwFXMGC/VNXmJBvo/fbHbuCiqtozlsJn2RT76fQkK+kNedwPXAiLej+9HHgD8Pkkd7S2tzIH309e5kKS1Fksw0eSpCEYCpKkjqEgSeoYCpKkjqEgSeoYCloQknx7kmuT/H2Su5PckOSFB7iuNyf5lr7nNyQ5+hDUeHmS/3yw65lm/Sv3uhrpSLenhclQ0LzXvhh0HfCpqnpBVZ1C7xzwEw5wlW8GulCoqrOr6uGDrXMWrATO3t9M0nQMBS0EZwDfrKrfm2yoqjuq6i+THJnkpiS3t9+MWA29i5IluSfJ+nbRtg8n+ZYkvwA8F7g5yc1t3vuTHNce/2KSu9rtzX3r2pLkve1a+Z9I8qxhi0/yX5Lc2up4+/7WmeT72ryfTfKbrZZnAL8C/FR6v1/wU231pyT5VJIvttcmTctQ0ELwIuC2Kab9M3Buu+DfGcCVrWcB8F3Auqr618AjwM9X1e/Qu8bMGVV1Rv+KkrwUeCPw/fSuif+zSV7cJq8A3lVVpwIPA/9umMKTvLotexq9v/Rf2ndBwqnW+QfAz1XVDwB7ANol4X8Z+FD7/YIPtXn/FXBmW/9l7fo70pQMBS10AX49yZ3AJ+ldfnhyWOmBqvqr9vgPgR/cz7p+ELiuqr5RVY8BHwV+qE27r6ruaI9vA5YPWd+r2+1vgdvpfYivmGqd7djGUVX11639A/tZ/x+3a/J/ld7F1g50SE2LxKK49pEWvM3AT0wx7fXAEuClVfXNJPcDz2zT9r7Gy/6u+TLocsaTHu97vAcYdvgowG9U1f96SmPvmvuD1jldDcPU5f95TcueghaCPweOSPKzkw1t3P3fAM8BdrZAOAN4ft9yz0vyA+3x64DPtMePAkcN2M6ngXPasYdn07v6518eZO1/BvxMu84+SZYlmfIHjarqIeDRJC9rTWv6Jk9VtzQ0Q0HzXvWu6ngu8CPtlNTNwOX0jg28H5hIsoler+GevkW3AOe1oaVj6f0wDMA64E8mDzT3bed2er9HfAu9X836/ar62xmW+7Yk2yZvVfUJekNAn03yeeDD7P+D/XxgXZLP0us5/GNrv5negeX+A83SjHiVVC1KbXjmj6rqReOuZaaSHNmOaZDkEmBpVb1pzGVpgXB8UZp//m2SS+n9//0S8NPjLUcLiT0FSVLHYwqSpI6hIEnqGAqSpI6hIEnqGAqSpM7/B2zTljIa0wklAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(captions_len, bins=200, color='black', alpha=0.7)\n",
    "plt.xlabel('Caption Length')\n",
    "plt.ylabel('Num')\n",
    "plt.title('Hist of Caption Length')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 计算超过 thresh 的占比\n",
    "def perce(captions_len, thresh=77):\n",
    "    cnt = 0\n",
    "    for caption_len in captions_len:\n",
    "        if caption_len > thresh:\n",
    "            cnt += 1\n",
    "    return cnt / len(captions_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "thresh: 50 -- 0.04929060665362035\n",
      "thresh: 77 -- 0.01259784735812133\n"
     ]
    }
   ],
   "source": [
    "for thresh in [50, 77]:\n",
    "    print(f\"thresh: {thresh} -- {perce(captions_len=captions_len, thresh=thresh)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 进行 Tokenization 之后的 Caption Length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "from open_clip import SimpleTokenizer, tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 8176/8176 [00:01<00:00, 4408.38it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "8176"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "_tokenizer = SimpleTokenizer()\n",
    "caption_token_len = []\n",
    "\n",
    "for caption in tqdm(captions):\n",
    "    caption_token_len.append(len(_tokenizer.encode(caption)))\n",
    "\n",
    "len(caption_token_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAdY0lEQVR4nO3dfZRddX3v8ffHAEECSAKBBgKCEtDgA2Bu1IW1IMiDEgNVukLRFblUtHKtXK0axKtyb9NF26tXXYI1ApoWSowokljrJUSp1SuE8CghpITnaSKJKPJQjRA/94/9GzkMM7PPTGbPnJP5vNaadfb5nd/e5/s7hPnMfvod2SYiImIwLxjrAiIiovMlLCIiolbCIiIiaiUsIiKiVsIiIiJqJSwiIqJWwiLaImmNpKNH4X3+XNIjkp6UtGfT79dGPaMy7jbqOFCSJe0w1rWMBkkPSDpurOuIZyUsot//MSW9W9KPep/bPsz29TXb2aZfaJJ2BD4LHG97V9uP9tNnJ0mflnSPpKdK7ZdJOnA479ln21+T9Fetbe2Mexjvs6aE4ZOStkr6Tcvzj4/kew2jtud9Btvje8bQjYu/UqJr7APsDKwZpM9VwHTgT4FbgUnAO4FjgUubLnAk2D6sd1nS9cDlti8Zu4oi6mXPItrSuvchabak1ZIeL4eMPlu6/bA8Plb+Sn59P9uZKOlzkjaUn8+VtkOAdS3rf7+fdY8D3gzMtX2T7Wds/8r2RbYvLX3OlLRW0hOS7pP03pb1j5bUI+njkn5exnRGee1s4Azgo6X25f2Mu9/a+2z7w5I2Sdoo6cwhfsYvkPQJSQ+WbfyDpBcN0PftpbZXlPUWSLpX0qOSlkqaUvr17u3Nl/RQGff5Q6mr5T1PlnSbpMck/T9Jr2p57QFJfynpDkm/kvR1STu3vP7R8plskPRnpaaDB/rci8MH2l6MvoRFDMfngc/b3h14KbC0tL+xPO5RDiP9pJ91zwdeBxwOvBqYDXzC9r8Dh7Ws/6Z+1j0OWGX74UFq2wScDOwOnAn8H0lHtrz+B8BewH7AfGCRpENtLwKuAP621D6n3dr7bPtFZdtnARdJmjxIrX29u/wcA7wE2BX4Yt9OJYT+BjjO9p3AXwCnAH8E7Av8Erioz2pvAA6l2gP7pKSXD6Euymd4GfBeYE/gy8Cy3rAs/gQ4ETgIeFUZC5JOBD5E9d/v4FInADWfe7/bi7GRsIhe3y5/MT4m6THg4kH6Pg0cLGkv20/avmEI73MG8D9tb7K9GbgAeFeb6+4JbBysg+1/tn2vK/8KXAv8YZ9u/8P2lvL6P1P9UhqJ2p8urz9t+7vAk1S/oNt1BvBZ2/fZfhI4D5in554DOhf4CHC07fWl7b3A+bZ7bG8BPg28o896F9j+te3bgdupwm4o3gN82faNtrfaXgxsoQrPXl+wvcH2L4DlVKEK1ef7VdtrbP8n1efWjoG2F2MgYRG9TrG9R+8P8P5B+p4FHALcLekmSScP4X32BR5sef5gaWvHo8C0wTpIOknSDZJ+UULvLVR7Er1+afupYb5/Xe2P2n6m5fl/Uu0dtKu/7e9AdS6n10eAi2z3tLS9GLi6JejXAlv7rPezbair9z0+3OcPiv157vgHeo99gda9wcH2DFtta80xghIWMWS277F9OrA31eGQqyRNAtqZwngD1S+eXgeUtnZcB8yWNL2/F8shkW8C/xvYp4TedwG1dJtcau3v/evq35ba29Hf9p8BHmlpOx74hKS3t7Q9DJzUGva2d7b9HyNY28PAwj7vsYvtK9tYdyPVRQm99u/zeqa+7gIJixgySe+UNNX274DHSvNWYDPwO6rj7QO5kuqX3VRJewGfBC5v531tXwesoPor+jWSdpC0m6T3SfqvwE7AxFLHM5JOovrl2tcFqi7B/UOq8xvfKO2PNFV7m64E/rukgyTtCvw18PU+eytrqI7jXyTpbaXt74GFkl4MUOqbuw11
TJC0c8vPTsBXgPdJeq0qkyS9VdJubWxvKXCmpJdL2oXqc2tV97lHB0hYxHCcCKyR9CTVye55tn9TjkcvBH5cDlW8rp91/wpYDdwB/BS4pbS16x1UewtfB34F3AnMAq6z/QTVyd6lVCd5/xRY1mf9n5XXNlCdWH2f7bvLa5cCM0vt326g9jqXAf9IdVXZ/cBvgA/07VTOO5wMfKUE4uepxnmtpCeAG4DXbkMdC4Bft/x83/ZqqvMWX6T6/NbT5gln2/8CfAH4QVmv98KHLeWx7nOPDqB8+VGMF6ruxL7cdr+HsWJ0lCux7gQm9tlrig6WPYuIaJykU8uhv8lU57mWJyi6S2NhIenQcgNP78/jks6VNEXSClXTNaxovQ5d0nmS1ktaJ+mEpmqLiFH3XqpzSfdSnd/687EtJ4ZqVA5DSZoA/AfVcdRzgF/YvlDSAmCy7Y9Jmkl1gm821aV21wGH2N7aeIERETGo0ToMdSxwr+0HgbnA4tK+mOrOU0r7knKz1P1UJ8Jmj1J9ERExiNGaSHAe1V4DVNe/bwSwvVHS3qV9P6qrOHr1lLbnKHPJnA0wadKk17zsZS9rrOiIiO3RzTff/HPbU4eyTuNhUa7RfhvV1AWDdu2n7XnHyMpcMosAZs2a5dWrV29zjRER44mkB+t7PddoHIY6CbjFdu9dqI9ImgZQHjeV9h6ee2fndEb27tiIiBim0QiL03n2EBRUNw/NL8vzgWta2uepmgb6IGAGsGoU6ouIiBqNHoYqt/a/meqyuV4XAkslnQU8BJwGYHuNpKXAXVTz4ZyTK6EiIjpDo2FRpn/Ys0/bo1RXR/XXfyHVdBEREdFBcgd3RETUSlhERESthEVERNRKWERERK2ERURE1EpY9GPOnDljXUJEREdJWERERK2ERURE1EpY1MghqYiIhEVERLQhYREREbUSFhERUSthERERtRIWERFRK2ERERG1Gv8O7m6VS2YjIp6VPYuIiKiVsIiIiFoJi4iIqJWwiIiIWgmLiIiolbCIiIhajYaFpD0kXSXpbklrJb1e0hRJKyTdUx4nt/Q/T9J6SeskndBkbRER0b6m77P4PPA92++QtBOwC/BxYKXtCyUtABYAH5M0E5gHHAbsC1wn6RDbWxuu8fdyb0VERP8a27OQtDvwRuBSANu/tf0YMBdYXLotBk4py3OBJba32L4fWA/Mbqq+iIhoX5OHoV4CbAa+KulWSZdImgTsY3sjQHncu/TfD3i4Zf2e0hYREWOsybDYATgS+JLtI4CnqA45DUT9tPl5naSzJa2WtHrz5s0jU2lERAyqybDoAXps31ieX0UVHo9ImgZQHje19N+/Zf3pwIa+G7W9yPYs27OmTp3aWPEREfGsxsLC9s+AhyUdWpqOBe4ClgHzS9t84JqyvAyYJ2mipIOAGcCqpuobijlz5uTkd0SMa01fDfUB4IpyJdR9wJlUAbVU0lnAQ8BpALbXSFpKFSjPAOeM5pVQERExsEbDwvZtwKx+Xjp2gP4LgYVN1hQREUOXO7gjIqJWwiIiImolLCIiolbCIiIiaiUsIiKiVsIiIiJqJSwiIqJWwiIiImolLCIiolbCIiIiaiUsIiKiVsIiIiJqJSwiIqJWwiIiImolLCIiolbCIiIiaiUsIiKiVsIiIiJqJSwiIqJWwiIiImolLCIiolbCIiIiajUaFpIekPRTSbdJWl3apkhaIeme8ji5pf95ktZLWifphCZri4iI9o3GnsUxtg+3Pas8XwCstD0DWFmeI2kmMA84DDgRuFjShFGoLyIiaozFYai5wOKyvBg4paV9ie0ttu8H1gOzR7+8iIjoq+mwMHCtpJslnV3a9rG9EaA87l3a9wMeblm3p7RFRMQY26Hh7R9le4OkvYEVku4epK/6afPzOlWhczbAAQccMDJVRkTEoBrds7C9oTxuAq6mOqz0iKRpAOVxU+neA+zfsvp0YEM/21xke5btWVOnTm2y/IiIKBoLC0mTJO3W
uwwcD9wJLAPml27zgWvK8jJgnqSJkg4CZgCrmqovIiLa1+RhqH2AqyX1vs8/2f6epJuApZLOAh4CTgOwvUbSUuAu4BngHNtbG6wvIiLa1FhY2L4PeHU/7Y8Cxw6wzkJgYVM1RUTE8OQO7oiIqJWwiIiIWgmLiIiolbCIiIhaCYuIiKiVsCjmzJkz1iVERHSshMUQzZkzJ8ESEeNOwiIiImolLCIiolbCIiIiaiUsIiKiVsIiIiJqJSwiIqJWwiIiImolLCIiolbCIiIiaiUsIiKiVsIiIiJqJSyGKfNDRcR4krCIiIhaCYuIiKiVsIiIiFqNh4WkCZJulfSd8nyKpBWS7imPk1v6nidpvaR1kk5ouraIiGjPaOxZfBBY2/J8AbDS9gxgZXmOpJnAPOAw4ETgYkkTRqG+iIio0WhYSJoOvBW4pKV5LrC4LC8GTmlpX2J7i+37gfXA7Cbri4iI9jS9Z/E54KPA71ra9rG9EaA87l3a9wMebunXU9oiImKMNRYWkk4GNtm+ud1V+mlzP9s9W9JqSas3b968TTVGRER7mtyzOAp4m6QHgCXAmyRdDjwiaRpAedxU+vcA+7esPx3Y0HejthfZnmV71tSpUxssPyIiejUWFrbPsz3d9oFUJ66/b/udwDJgfuk2H7imLC8D5kmaKOkgYAawqqn6IiKifTuMwXteCCyVdBbwEHAagO01kpYCdwHPAOfY3joG9UVERB+jEha2rweuL8uPAscO0G8hsHA0aoqIiPblDu6IiKiVsIiIiFptHYYqJ5w/ABzYuo7ttzVTVkREdJJ2z1l8G7gUWM5zb7CLiIhxoN2w+I3tLzRaSRfq/QKk5cuXj3ElERHNajcsPi/pU8C1wJbeRtu3NFJVRER0lHbD4pXAu4A38exhKJfnERGxnWs3LE4FXmL7t00WExERnandS2dvB/ZosI6u0HuOIiJivGl3z2If4G5JN/Hccxa5dDYiYhxoNyw+1WgVERHR0doKC9v/2nQhERHRudq9g/sJnv0iop2AHYGnbO/eVGEREdE52jrBbXs327uXn52BtwNfbLa07pET3xGxvRvWRIK2v03usYiIGDfaPQz1xy1PXwDMop/vx46IiO1Tu1dDtR5neQZ4AJg74tVERERHavdqqDObLiQiIjrXoGEh6ZODvGzb/2uE64mIiA5Ut2fxVD9tk4CzgD2BhEVExDgwaFjY/kzvsqTdgA8CZwJLgM8MtF5ERGxfas9ZSJoCfAg4A1gMHGn7l00XFhERnWPQ+ywk/R1wE/AE8Erbn243KCTtLGmVpNslrZF0QWmfImmFpHvK4+SWdc6TtF7SOkknbMO4IiJiBNXdlPdhYF/gE8AGSY+XnyckPV6z7hbgTbZfDRwOnCjpdcACYKXtGcDK8hxJM4F5wGHAicDFkiYMc1wRETGCBg0L2y+w/cI+033s3vu8Zl3bfrI83bH8mOr+jMWlfTFwSlmeCyyxvcX2/cB6YPbwhhURESOp3ZvyhqXsGdwMHAxcZPtGSfvY3ghge6OkvUv3/YAbWlbvKW2NyrxOERH1hjU3VLtsb7V9ODAdmC3pFYN0V3+beF4n6WxJqyWt3rx58whVGhERg2k0LHrZfgy4nupcxCOSpgGUx02lWw+wf8tq04EN/Wxrke1ZtmdNnTq1ybIjIqJoLCwkTZW0R1l+IXAccDewDJhfus0HrinLy4B5kiZKOgiYAaxqqr6IiGhfk+cspgGLy3mLFwBLbX9H0k+ApZLOAh4CTgOwvUbSUuAuqskKz7G9tcH6RlTvuY/ly5ePcSURESOvsbCwfQdwRD/tjwLHDrDOQmBhUzVFRMTwjMo5i4iI6G4Ji4iIqJWwiIiIWgmLiIiolbCIiIhaCYuIiKiVsIiIiFoJi4iIqJWwiIiIWgmLiIiolbAYYfl+jIjYHiUsIiKiVsIiIiJqJSwiIqJWwiIiImolLCIiolbCIiIiaiUsIiKiVsIiIiJqJSwiIqJWwiIiImol
LCIiolbCIiIiajUWFpL2l/QDSWslrZH0wdI+RdIKSfeUx8kt65wnab2kdZJOaKq2iIgYmib3LJ4BPmz75cDrgHMkzQQWACttzwBWlueU1+YBhwEnAhdLmtBgfY2ZM2dOZp+NiO1KY2Fhe6PtW8ryE8BaYD9gLrC4dFsMnFKW5wJLbG+xfT+wHpjdVH0REdG+UTlnIelA4AjgRmAf2xuhChRg79JtP+DhltV6SlvfbZ0tabWk1Zs3b2607oiIqDQeFpJ2Bb4JnGv78cG69tPm5zXYi2zPsj1r6tSpI1VmREQMotGwkLQjVVBcYftbpfkRSdPK69OATaW9B9i/ZfXpwIYm64uIiPY0eTWUgEuBtbY/2/LSMmB+WZ4PXNPSPk/SREkHATOAVU3VFxER7duhwW0fBbwL+Kmk20rbx4ELgaWSzgIeAk4DsL1G0lLgLqorqc6xvbXB+iIiok2NhYXtH9H/eQiAYwdYZyGwsKmaIiJieHIHd0RE1EpYRERErYRFRETUSlhERESthMUoyVxREdHNEhYREVGryfssguxRRMT2IXsWERFRK2ERERG1EhYREVErYREREbUSFhERUSthERERtRIWERFRK2HRoNxjERHbi4RFRETUSlhERESthEVERNRKWIyiOXPm5DxGRHSlhEVERNRKWERERK2ERURE1GosLCRdJmmTpDtb2qZIWiHpnvI4ueW18yStl7RO0glN1RUREUPX5J7F14AT+7QtAFbangGsLM+RNBOYBxxW1rlY0oQGa4uIiCFo7JvybP9Q0oF9mucCR5flxcD1wMdK+xLbW4D7Ja0HZgM/aaq+sdR6RdTy5cvHsJKIiPaM9jmLfWxvBCiPe5f2/YCHW/r1lLbnkXS2pNWSVm/evLnRYiMiotIpJ7jVT5v762h7ke1ZtmdNnTq14bIiIgJGPywekTQNoDxuKu09wP4t/aYDG0a5toiIGMBoh8UyYH5Zng9c09I+T9JESQcBM4BVTReTu6kjItrT2AluSVdSnczeS1IP8CngQmCppLOAh4DTAGyvkbQUuAt4BjjH9tamaouIiKFp8mqo0wd46dgB+i8EFjZVT0REDF+nnOCOiIgOlrCIiIhaCYsxlmnLI6IbJCwiIqJWwqJDZO8iIjpZwiIiImolLDpQ9jIiotMkLCIiolbCIiIiaiUsIiKiVsIiIiJqJSwiIqJWYxMJxtC1XgXVu5yvXY2ITpA9iy6QKUEiYqwlLLpIAiMixkrCIiIiaiUsOlzfvYkckoqIsZCw2E4kRCKiSbkaqku1BkOumIqIpmXPYjuTvYuIaEL2LLYD/Z3XgGqPI3sgETESsmexHcteRkSMlI4LC0knSlonab2kBWNdz/akv5PgOTEeEe3oqLCQNAG4CDgJmAmcLmnm2Fa1/ekNiIFCoonwSCBFdLdOO2cxG1hv+z4ASUuAucBdY1rVODDQeQ949lxHO7/wBztP0k77cF7r29Za70C1D+X8zVDn6cq8XtGksfr31WlhsR/wcMvzHuC1rR0knQ2cXZ4+KWndMN5nL+Dnw6qwczU2JknD7jvQuoNts+W1542pv/UGe8/hvH8bdW1L//zb6w4dP6Zh/BtuHdOLh7pyp4VFf6P3c57Yi4BF2/Qm0mrbs7ZlG50mY+oOGVN3yJier6POWVDtSezf8nw6sGGMaomIiKLTwuImYIakgyTtBMwDlo1xTRER415HHYay/Yyk/wb8X2ACcJntNQ281TYdxupQGVN3yJi6Q8bUh2zX94qIiHGt0w5DRUREB0pYRERErXEXFt06nYikyyRtknRnS9sUSSsk3VMeJ7e8dl4Z4zpJJ4xN1QOTtL+kH0haK2mNpA+W9m4e086SVkm6vYzpgtLetWPqJWmCpFslfac87+oxSXpA0k8l3SZpdWnr9jHtIekqSXeX/69eP6Jjsj1ufqhOmt8LvATYCbgdmDnWdbVZ+xuBI4E7W9r+FlhQlhcAf1OWZ5axTQQOKmOeMNZj6DOe
acCRZXk34N9L3d08JgG7luUdgRuB13XzmFrG9iHgn4DvdPu/vVLnA8Befdq6fUyLgT8ryzsBe4zkmMbbnsXvpxOx/VugdzqRjmf7h8Av+jTPpfoHQnk8paV9ie0ttu8H1lONvWPY3mj7lrL8BLCW6g7+bh6TbT9Znu5YfkwXjwlA0nTgrcAlLc1dPaYBdO2YJO1O9QflpQC2f2v7MUZwTOMtLPqbTmS/MaplJOxjeyNUv3yBvUt7V41T0oHAEVR/iXf1mMrhmtuATcAK210/JuBzwEeB37W0dfuYDFwr6eYyhRB095heAmwGvloOF14iaRIjOKbxFha104lsJ7pmnJJ2Bb4JnGv78cG69tPWcWOyvdX24VSzD8yW9IpBunf8mCSdDGyyfXO7q/TT1lFjKo6yfSTVDNfnSHrjIH27YUw7UB2m/pLtI4CnqA47DWTIYxpvYbG9TSfyiKRpAOVxU2nvinFK2pEqKK6w/a3S3NVj6lUOAVwPnEh3j+ko4G2SHqA6bPsmSZfT3WPC9obyuAm4muoQTDePqQfoKXuyAFdRhceIjWm8hcX2Np3IMmB+WZ4PXNPSPk/SREkHATOAVWNQ34Akier46lrbn215qZvHNFXSHmX5hcBxwN108Zhsn2d7uu0Dqf5/+b7td9LFY5I0SdJuvcvA8cCddPGYbP8MeFjSoaXpWKqvdhi5MY31GfwxuGLgLVRX3twLnD/W9Qyh7iuBjcDTVH8VnAXsCawE7imPU1r6n1/GuA44aazr72c8b6Da7b0DuK38vKXLx/Qq4NYypjuBT5b2rh1Tn/EdzbNXQ3XtmKiO799eftb0/h7o5jGVGg8HVpd/f98GJo/kmDLdR0RE1Bpvh6EiImIYEhYREVErYREREbUSFhERUSthERERtRIW0REk/YGkJZLulXSXpO9KOmSY2zpX0i4tz7/be//DNtR3fpmh9DZJW1uW/6Kfvu+W9MVteb9+tnl074yvTSgzlr5/tN4vuk/CIsZcuUHvauB62y+1PRP4OLDPMDd5LvD7sLD9Fld3VA+b7YW2D3c1lceve5dtf2FbtttB9gDeX9cpxq+ERXSCY4Cnbf99b4Pt22z/m6RdJa2UdEv5/oG5UE0+WObtXyzpjjKP/y7lL/19gR9I+kHp+4CkvcryhyTdWX7ObdnWWklfUfU9FNeWO7AHper7K75a6rpV0jH99HmrpJ9I2kvS8WX5FknfKPNi9dZ3QcsYX9buBzfUbZa7zFeU9i9LerB8NhcCLy17S39XNr+rnv1+hCtKqMc4lbCITvAKYKCJ6n4DnOpq0rdjgM+0/NI6FFhk+1XA48D7y1/6G4BjbD/nl7ek1wBnAq+l+p6J90g6orw8A7jI9mHAY8Db26j7HADbrwROBxZL2rnl/U6lmsztLaXpE8BxZSyrqb4jotfPS/uXgL9s470pv+SHus1PUU3ZcSTV3twBpX0BcG/ZW/pIaTuCai9tJtVdz0e1U1dsnxIW0ekE/LWkO4DrqKZR7j089bDtH5fly6mmEBnMG4CrbT/l6nsnvgX8YXntftu3leWbgQPbqO0NwD8C2L4beBDoPc9yDPAx4K22f0kVTjOBH6uawnw+8OKWbfVOpNjuezPMbb6BakJAbH8P+OUg219lu8f276imY2m3rtgO7TDWBURQzc/zjgFeOwOYCrzG9tOqZj/t/eu971w1dXPXDHYYZUvL8lag9jBUzfbuo/pr/BCqv/hF9f0Wp9e8/1ba//9yONscyqGkvp9Jfl+MY9mziE7wfWCipPf0Nkj6L5L+CHgR1fcpPF3OCbT+5XyApNeX5dOBH5XlJ6i+qrWvHwKnlHMbk4BTgX/bhrp/SBVmlCu3DqCalA2qvYw/Bv5B0mHADcBRkg4u/XcZ7tVeLYazzR8Bf1L6H0812RwM/JlFAAmL6ACuZrM8FXhzuXR2DfBpqnMPVwCzJK2m+sV8d8uqa4H55RDVFKpj8wCLgH/pPcHd8j63AF+jmor5
RuAS27duQ+kXAxMk/RT4OvBu27//a9z2ulLzN4DdgXcDV5Z6bwDaPpFdHCupp/cHOHgY27wAOF7SLVRf/LMReML2o1SHs+5sOcEd8XuZdTa6kqqvYv2O7cG+iS76kDQR2Gr7mbJX9qVyOXDEoHIMMmJ8OQBYKukFwG+B99T0jwCyZxEREW3IOYuIiKiVsIiIiFoJi4iIqJWwiIiIWgmLiIio9f8BtmEf0E26JSoAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.hist(caption_token_len, bins=200, color='black', alpha=0.7)\n",
    "plt.xlabel('Caption Token Length')\n",
    "plt.ylabel('Num')\n",
    "plt.title('Hist of Caption Token Length')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "thresh: 50 -- 0.16169275929549903\n",
      "thresh: 77 -- 0.05002446183953033\n",
      "thresh: 100 -- 0.022137964774951075\n"
     ]
    }
   ],
   "source": [
    "for thresh in [50, 77, 100]:\n",
    "    print(f\"thresh: {thresh} -- {perce(captions_len=caption_token_len, thresh=thresh)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 统计 valid & train 数据集中的长度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def visualize_word_length(dataset='test', root='/remote-home/share/medical/public/ROCO', bins=200):\n",
    "    \"\"\"Plot a histogram of caption word counts for one dataset split.\n",
    "\n",
    "    Reads ``{root}/{dataset}/radiology/processed_{dataset}.csv``, counts the\n",
    "    space-separated pieces of every value in its ``caption`` column, prints\n",
    "    how many captions were read and shows the histogram.\n",
    "\n",
    "    Args:\n",
    "        dataset: Split name; used for the sub-directory, the file name and the plot title.\n",
    "        root: Directory containing the per-split folders. The default is the\n",
    "            location that used to be hard-coded in this function.\n",
    "        bins: Number of histogram bins.\n",
    "\n",
    "    Returns:\n",
    "        None. The function only prints and plots.\n",
    "    \"\"\"\n",
    "    input_filepath = f\"{root}/{dataset}/radiology/processed_{dataset}.csv\"\n",
    "    df = pd.read_csv(input_filepath, sep=',')\n",
    "    captions = df[\"caption\"].tolist()\n",
    "    # Length = number of pieces after splitting on single spaces.\n",
    "    captions_len = [len(caption.split(' ')) for caption in captions]\n",
    "    print(f\"len(captions_len): {len(captions_len)}\")\n",
    "\n",
    "    # Draw on a dedicated figure so the plot never lands on a pre-existing Axes.\n",
    "    fig, ax = plt.subplots()\n",
    "    ax.hist(captions_len, bins=bins, color='black', alpha=0.7)\n",
    "    ax.set_xlabel('Caption Length')\n",
    "    ax.set_ylabel('Num')\n",
    "    ax.set_title(f'Caption Length in {dataset}set')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(captions_len): 8176\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAcDUlEQVR4nO3df5xcdX3v8debGAICCpiQhiQYxNCaaF10m0KxVxAvidzuTegtNtTSSKnRNtxCQW+BWoHaVFoFbn1cwYZCyUPFkApIYtGCKVRpkbBJw48kpKQSYUmaLCAmsRrJ8rl/nO+enGxmdmc3e2Zmd97Px2Mec+Z7vud7PnMymc9+v+fM9ygiMDMzAzik0QGYmVnzcFIwM7Ock4KZmeWcFMzMLOekYGZmOScFMzPLOSlYU5P0RUl/2ug4yiLpIUm/V2PdX5W0qeyYrLU5KdigSfotSZ2SdkvaJumbkt4zDO1+WNLDxbKI+FhEfPpg266wr2skfXm42y1znxHx3Yj4+SHu+4BjO1SStkh6/zC0M2wx2fBxUrBBkXQZ8H+BvwAmAicANwFzGxiWmQ2XiPDDj5oewBuB3cB5/dSZBTwCvAJsA/4fcGhhfQB/CHwfeBH4LNkfJ28Dfgr0pH28kurfDvx5YfuPAJuBl4EVwPF92v4Y8AzwQ+ALgKrEeQ3w5SrrTgX+Nb2Hx4EzCuseAj4N/AuwC7gfGF9Y/zvAD4CXgD8FtgDvB+YAPwNeTe/v8Vra6xPXGUBX4fUW4OPAE8CPgDuBwypsV+3YjgM+BzwHbAe+CBye1o0HvpGOwcvAd9O/05eA14CfpLb+D3AY8OX0nl8BHgMmFj4zt6bPwgvAnwNjqsXkR+Mf7inYYJxG9gVwTz91eoA/IvtSOQ04C/iDPnXOBdqBd5H1MH43IjaSfaE/EhFHRsTRfRuW9D7gM8AHgUlkX77L+lT7NeCXgHemerNrf3sgaTLwD2RfXseSfeneJWlCodpvARcCxwGHpjpImkHWa/pQiu+NwGSAiPgWWe/qzvT+3jlQezX6IFnCORH4ReDDfSv0c2z/EjgZaAPemmL9VFp3OdAFTCDrEV6VNRUXkCWRjtTWXwEL0nudCrwp7esnqZ2lwN7U/inA2cDv1fLvbY3hpGCD8SbgxYjYW61CRKyJiO9FxN6I2AL8DfDePtX+MiJejojnyIaizq9x/x8CbouItRGxB7gSOE3StEKd6yLildT2g2RfeIPx28B9EXFfRLwWEQ8AncA5hTp/FxH/HhE/AZYX9vEbwMqIeDgifkb2BVvL5GLV2qvF5yNia0S8DKysdVtJIut1/VH6t9hFlrTmpyqvkiW2N0fEq5Gdz6j2Xl4l+2y8NSJ60mdgp6SJwAeASyPixxGxA7ixsA9rQq9rdAA2orwEjJf0umqJQdLJwA1kPYHXk33G1vSp9nxh+QfA8TXu/3hgbe+LiNgt6SWyv3C3pOL/LNT/L+DIGtvu9WbgPEkdhbKxZAmmV7V9HE/hvUXEf6X4BnIwMffdttZjOYHs32dNlh8AENnQDmTDetcA96f1SyLiuiptfYmsl7BM0tFkQ0l/QnYsxwLbCvs4hP3//a3JuKdgg/EI2TjwvH7q3Aw8DUyPiDeQDTuoT52pheUTgK1peaC/qreSfdEAIOkIsr9QXxgo8EF4HvhSRBxdeBzRzxdi0TZgSiG+w1N8vRo5JXHffb9INsQzs/A+3xgRRwJExK6IuDwi3gJ0AJdJOqtSW6kncW1EzAB+hWwI73fIjuUesnMkvft4Q0TMrBKTNQEnBatZRPyIbEjkC5LmSXq9pLGSPiDpr1K1o4CdwG5JvwD8foWmPiHpGElTgUvITpBCdrJziqRDq4RwB3ChpDZJ48iGOx5Nw1RDcYikwwqPcWR/5XZImi1pTCo/Q9KUgRoDvpa2/ZX0Hq5l/4S4HZgmqRH/7/Y7thHxGnALcKOk4yA7nyJpdlr+NUlvTcNMO8nO
FfUU2npLb8OSzpT0DkljUt1XgZ6I2EZ24vx6SW+QdIikkyS9t9BOf//e1gBOCjYoEXEDcBnwSaCb7K/Bi4GvpyofJztxuovsS+fOA1vhXrIhpXVkJ3VvTeX/BKwH/lPSixX2vYrsip67yP4qP4mDG58+n+yv5d7Hf0TE82Qnv68qvL9PUMP/lYhYD/xvspPf28iOwQ6yv5YB/j49vyRp7YEtlKrSsf1jsiu5vidpJ/BtoPd3ENPT691kPcSbIuKhtO4zwCclvSLp48DPkSXEncBG4J/JkitkPYZDgQ1kV4R9jexcRbWYrMFU/dyR2fCTFGRDS5sbHUvZJB1Jdonm9Ih4tsHhmNXEPQWzYSSpIw2rHUH2G4An2XcS3KzpOSmYDa+5ZCfEt5INwczv51JOs6bj4SMzM8u5p2BmZrkR/eO18ePHx7Rp0xodhpnZiLJmzZoXI2JCpXUjOilMmzaNzs7ORodhZjaiSPpBtXUePjIzs5yTgpmZ5ZwUzMws56RgZmY5JwUzM8s5KZiZWc5JwczMck4KZmaWc1IwM7Ock8IQdXR0DFzJzGyEKS0ppNsYrpb0uKT1kq5N5ddIekHSuvQ4p7DNlZI2S9rUe1vAZuJEYGajXZlzH+0B3hcRuyWNBR6W9M207saI+FyxsqQZZLdWnAkcD3xb0skR0YOZmdVFaT2FyOxOL8emR383b5gLLIuIPenWhZuBWWXFZ2ZmByr1nIKkMZLWkd28/IGIeDStuljSE5Juk3RMKptMdpP0Xl2prG+bCyV1Surs7u4uM3wzs5ZTalKIiJ6IaAOmALMkvR24GTgJaAO2Aden6qrURIU2l0REe0S0T5hQcTrwYdfR0eHzCWbWEupy9VFEvAI8BMyJiO0pWbwG3MK+IaIuYGphsylk97k1M7M6KfPqowmSjk7LhwPvB56WNKlQ7VzgqbS8ApgvaZykE8luer66rPjMzOxAZV59NAlYKmkMWfJZHhHfkPQlSW1kQ0NbgI8CRMR6ScuBDcBeYJGvPDIzq6/SkkJEPAGcUqH8gn62WQwsLismMzPrn3/RbGZmOScFMzPLlXlOYVTypalmNpq5p2BmZjknBTMzyzkpmJlZzknBzMxyTgpmZpZzUjAzs5yTgpmZ5ZwUzMws56RgZmY5JwUzM8s5KQwDT31hZqOF5z46CE4GZjbauKdgZmY5JwUzM8s5KZiZWc5JwczMck4KZmaWKy0pSDpM0mpJj0taL+naVH6spAckPZOejylsc6WkzZI2SZpdVmxmZlZZmT2FPcD7IuKdQBswR9KpwBXAqoiYDqxKr5E0A5gPzATmADdJGlNifDXxZadm1kpKSwqR2Z1ejk2PAOYCS1P5UmBeWp4LLIuIPRHxLLAZmFVWfGZmdqBSzylIGiNpHbADeCAiHgUmRsQ2gPR8XKo+GXi+sHlXKuvb5kJJnZI6u7u7ywzfzKzllPqL5ojoAdokHQ3cI+nt/VRXpSYqtLkEWALQ3t5+wPrhMthho976K1eu3G/ZzGwkqcvVRxHxCvAQ2bmC7ZImAaTnHalaFzC1sNkUYGs94jMzs0yZVx9NSD0EJB0OvB94GlgBLEjVFgD3puUVwHxJ4ySdCEwHVpcVn5mZHajM4aNJwNJ0BdEhwPKI+IakR4Dlki4CngPOA4iI9ZKWAxuAvcCiNPxkZmZ1UlpSiIgngFMqlL8EnFVlm8XA4rJiMjOz/vkXzWZmlnNSMDOznJOCmZnlnBTMzCznpGBmZjknBTMzyzkpmJlZzknBzMxyTgpmZpZzUjAzs5yTgpmZ5ZwUzMws56RgZmY5JwUzM8s5KZRosLf0NDNrNCeFCvxlbmatyklhmDmhmNlI5qRgZmY5JwUzM8s5KZiZWa60pCBpqqQHJW2UtF7SJan8GkkvSFqXHucUtrlS0mZJmyTNLis2MzOr7HUltr0XuDwi1ko6Clgj6YG07saI+FyxsqQZwHxgJnA88G1JJ0dET4kxmplZQWk9hYjYFhFr
0/IuYCMwuZ9N5gLLImJPRDwLbAZmlRWfmZkdqC7nFCRNA04BHk1FF0t6QtJtko5JZZOB5wubdVEhiUhaKKlTUmd3d3eZYZuZtZzSk4KkI4G7gEsjYidwM3AS0AZsA67vrVph8zigIGJJRLRHRPuECRPKCXoYdXR0+LcLZjZilJoUJI0lSwhfiYi7ASJie0T0RMRrwC3sGyLqAqYWNp8CbC0zPjMz21+ZVx8JuBXYGBE3FMonFaqdCzyVllcA8yWNk3QiMB1YXVZ8ZmZ2oDKvPjoduAB4UtK6VHYVcL6kNrKhoS3ARwEiYr2k5cAGsiuXFvnKIzOz+iotKUTEw1Q+T3BfP9ssBhaXFZOZmfXPv2g2M7Ock4KZmeWcFMzMLOekYGZmOScFMzPLOSmYmVnOSaGOPOWFmTU7JwUzM8uV+YvmEcd/xZtZq3NPoU6ccMxsJHBSMDOznJOCmZnlnBTMzCznpGBmZjknBTMzyzkpmJlZzknBzMxyTgpmZpZzUjAzs5yTgpmZ5UpLCpKmSnpQ0kZJ6yVdksqPlfSApGfS8zGFba6UtFnSJkmzy4rNzMwqK7OnsBe4PCLeBpwKLJI0A7gCWBUR04FV6TVp3XxgJjAHuEnSmBLjMzOzPmpKCpJOlHSDpLslreh99LdNRGyLiLVpeRewEZgMzAWWpmpLgXlpeS6wLCL2RMSzwGZg1qDf0QjiSfLMrNnUOnX214FbgZXAa4PdiaRpwCnAo8DEiNgGWeKQdFyqNhn4XmGzrlTWt62FwEKAE044YbChmJlZP2pNCj+NiM8PZQeSjgTuAi6NiJ2SqlatUBYHFEQsAZYAtLe3H7DezMyGrtak8NeSrgbuB/b0FvYOD1UjaSxZQvhKRNydirdLmpR6CZOAHam8C5ha2HwKsLXG+Eas3iGklStXNjgSM7Pak8I7gAuA97Fv+CjS64qUdQluBTZGxA2FVSuABcB16fneQvkdkm4AjgemA6trjG9E8bkEM2tWtSaFc4G3RMTPBtH26WSJ5ElJ61LZVWTJYLmki4DngPMAImK9pOXABrIrlxZFRM8g9mdmZgep1qTwOHA0+4Z6BhQRD1P5PAHAWVW2WQwsrnUfZmY2vGpNChOBpyU9xv7nFP5nKVGZmVlD1JoUri41CjMzawo1JYWI+OeyAzEzs8arKSlI2sW+3wwcCowFfhwRbygrMDMzq79aewpHFV9Lmscon4LCzKwVDWlCvIj4Ov38RsHMzEamWoePfr3w8hCgnQpTUJiZ2chW69VHxZ/g7gW2kM1qamZmo0it5xQuLDsQMzNrvH6TgqRP9bM6IuLTwxyPmZk10EA9hR9XKDsCuAh4E+CkYGY2ivSbFCLi+t5lSUcBlwAXAsuA66ttZ2ZmI9OA5xQkHQtcBnyI7PaZ74qIH5YdmJmZ1d9A5xQ+C/w62Z3O3hERu+sSlZmZNcRAP167nOyGN58EtkramR67JO0sPzwzM6ungc4pDOkXz2ZmNjL5Sz/xLTLNzJwUzMyswEnBzMxyTgpmZpYrLSlIuk3SDklPFcqukfSCpHXpcU5h3ZWSNkvaJGl2WXGZmVl1ZfYUbgfmVCi/MSLa0uM+AEkzgPnAzLTNTZLGlBibmZlVUFpSiIjvAC/XWH0usCwi9kTEs8BmfGe3mvnKKTMbLo04p3CxpCfS8NIxqWwy8HyhTlcqO4CkhZI6JXV2d3eXHauZWUupd1K4GTgJaAO2sW9SPVWoW/HObhGxJCLaI6J9woQJpQTZCP5r38yaQV2TQkRsj4ieiHgNuIV9Q0RdwNRC1SnA1nrGZmZmdU4KkiYVXp4L9F6ZtAKYL2mcpBOB6cDqesbWTNxrMLNGqfUezYMm6avAGcB4SV3A1cAZktrIhoa2AB8FiIj1kpYDG8juAb0oInrKis3MzCorLSlExPkVim/tp/5iYHFZ8ZiZ2cD8i2YzM8s5KZiZWc5JwczMck4KZmaWc1Jocr481czqqbSrj2zwnADMrNHcUzAz
s1zL9xT817mZ2T7uKZiZWc5JwczMck4KZmaWc1IwM7Ock4KZmeVaOin4yiMzs/21/CWpzcoJy8waoaV7CmZmtj8nBTMzyzkpjAAdHR0eTjKzunBSMDOznJOCmZnlSksKkm6TtEPSU4WyYyU9IOmZ9HxMYd2VkjZL2iRpdllxmZlZdWX2FG4H5vQpuwJYFRHTgVXpNZJmAPOBmWmbmySNKTE2MzOroLSkEBHfAV7uUzwXWJqWlwLzCuXLImJPRDwLbAZmlRWbmZlVVu9zChMjYhtAej4ulU8Gni/U60plB5C0UFKnpM7u7u5SgzUzazXNcqJZFcqiUsWIWBIR7RHRPmHChJLDMjNrLfVOCtslTQJIzztSeRcwtVBvCrC1zrGZmbW8eieFFcCCtLwAuLdQPl/SOEknAtOB1XWOzcys5ZU2IZ6krwJnAOMldQFXA9cByyVdBDwHnAcQEeslLQc2AHuBRRHRU1ZsZmZWWWlJISLOr7LqrCr1FwOLy4rHzMwG1iwnmq0Gnv/IzMrmpDBCOUGYWRmcFMzMLOekYGZmOd+Oc4TxsJGZlck9BTMzyzkpmJlZzknBzMxyPqcwgvn8gpkNN/cUzMws56RgZmY5JwUzM8s5KZiZWc5JwczMck4KZmaWc1IYZXyZqpkdDCcFMzPL+cdro4R7CGY2HNxTMDOznJPCKObeg5kNVkOGjyRtAXYBPcDeiGiXdCxwJzAN2AJ8MCJ+2Ij4zMxaVSN7CmdGRFtEtKfXVwCrImI6sCq9NjOzOmqm4aO5wNK0vBSY17hQzMxaU6OSQgD3S1ojaWEqmxgR2wDS83GVNpS0UFKnpM7u7u46hWtm1hoadUnq6RGxVdJxwAOSnq51w4hYAiwBaG9vj7ICHC2KJ5tXrlzZwEjMbCRoSE8hIram5x3APcAsYLukSQDpeUcjYhvNOjo6fEWSmfWr7j0FSUcAh0TErrR8NvBnwApgAXBder633rGNFrV88ffWce/BzIoaMXw0EbhHUu/+74iIb0l6DFgu6SLgOeC8BsRmZtbS6p4UIuL7wDsrlL8EnFXveMzMbJ9muiTVzMwazEnBzMxyTgpmZpZzUjAzs5yTQovzbxfMrMhJwQBPs21mGScFq8g9CLPW5KTQgqp92TsRmJmTgpmZ5ZwU7ADuLZi1LicFMzPLOSlYzYo9CPcmzEYnJwXrV98vf5+MNhvdnBTMzCzXqNtx2gjS3yWs4Bv1mI0m7inYsKg2rDSY4SYPTZk1npOCHTSfgDYbPTx8ZMPOicFs5HJPweqid2io7xDRQFNuVKvr3olZOdxTsBHLJ8DNhp8iotEx7EfSHOCvgTHA30bEddXqtre3R2dn55D35b8wm9PKlSuH/G/Td9tKiaGjo6PmhDGYumYjhaQ1EdFeaV1TDR9JGgN8AfgAMAM4X9KMxkZl9XYwydpXOpkdnGYbPpoFbI6I7wNIWgbMBTY0NCobsQb64i/2BGoZjqolkVRrr9jjGKg3U237wfSC+tYdzLBaf7FXqjOYNvurW63N/rbtrzdX9lBiI4Yqy95nsyWFycDzhdddwC8XK0haCCxML3dL2jTIfYwHXhxyhK2hpY6RpKHWq3icqrU32PKB6g2mvWJZrfurZV81rt/vOB3E8R7yMRzKex6MYWp/UP/vDnKfb662otmSQqV3ud9Jj4hYAiwZ8g6kzmpjaZbxMaqNj1NtfJxq0yzHqanOKZD1DKYWXk8BtjYoFjOzltNsSeExYLqkEyUdCswHVjQ4JjOzltFUw0cRsVfSxcA/kl2SeltErB/m3Qx56KmF+BjVxsepNj5OtWmK49R0v1MwM7PGabbhIzMzayAnBTMzy7VMUpA0R9ImSZslXdHoeJqJpC2SnpS0TlJnKjtW0gOSnknPxzQ6znqTdJukHZKeKpRVPS6Srkyfr02SZjcm6vqrcpyukfRC+kytk3ROYV3LHSdJUyU9KGmjpPWSLknlTfd5aomk4OkzanJmRLQVrpO+AlgV
EdOBVel1q7kdmNOnrOJxSZ+n+cDMtM1N6XPXCm7nwOMEcGP6TLVFxH3Q0sdpL3B5RLwNOBVYlI5F032eWiIpUJg+IyJ+BvROn2HVzQWWpuWlwLzGhdIYEfEd4OU+xdWOy1xgWUTsiYhngc1kn7tRr8pxqqYlj1NEbIuItWl5F7CRbAaHpvs8tUpSqDR9xuQGxdKMArhf0po0jQjAxIjYBtkHGjiuYdE1l2rHxZ+xA10s6Yk0vNQ7LNLyx0nSNOAU4FGa8PPUKklhwOkzWtzpEfEusuG1RZL+W6MDGoH8GdvfzcBJQBuwDbg+lbf0cZJ0JHAXcGlE7OyvaoWyuhynVkkKnj6jHxGxNT3vAO4h66ZulzQJID3vaFyETaXacfFnrCAitkdET0S8BtzCvqGPlj1OksaSJYSvRMTdqbjpPk+tkhQ8fUYVko6QdFTvMnA28BTZ8VmQqi0A7m1MhE2n2nFZAcyXNE7SicB0YHUD4msKvV90yblknylo0eOkbErTW4GNEXFDYVXTfZ6aapqLstRp+oyRaiJwT5qG93XAHRHxLUmPAcslXQQ8B5zXwBgbQtJXgTOA8ZK6gKuB66hwXCJivaTlZPf+2AssioiehgReZ1WO0xmS2siGPLYAH4WWPk6nAxcAT0pal8quogk/T57mwszMcq0yfGRmZjVwUjAzs5yTgpmZ5ZwUzMws56RgZmY5JwUbFST9nKRlkv5D0gZJ90k6eYhtXSrp9YXX90k6ehhivEbSxw+2nX7ab+szG2mp+7PRyUnBRrz0w6B7gIci4qSImEF2DfjEITZ5KZAnhYg4JyJeOdg466ANOGegSmb9cVKw0eBM4NWI+GJvQUSsi4jvSjpS0ipJa9M9I+ZCNimZpKclLU2Ttn1N0usl/SFwPPCgpAdT3S2SxqflyyQ9lR6XFtraKOmWNFf+/ZIOrzV4SZ+Q9FiK49qB2pT0S6nuI5I+m2I5FPgz4DeV3b/gN1PzMyQ9JOn76b2Z9ctJwUaDtwNrqqz7KXBumvDvTOD61LMA+HlgSUT8IrAT+IOI+DzZHDNnRsSZxYYkvRu4EPhlsjnxPyLplLR6OvCFiJgJvAL8r1oCl3R22nYW2V/67y5MSFitzb8DPhYRpwE9AGlK+E8Bd6b7F9yZ6v4CMDu1f3Waf8esKicFG+0E/IWkJ4Bvk00/3Dus9HxE/Eta/jLwngHaeg9wT0T8OCJ2A3cDv5rWPRsR69LyGmBajfGdnR7/Bqwl+xKfXq3NdG7jqIj411R+xwDt/0Oak/9FssnWhjqkZi2iJeY+slFvPfAbVdZ9CJgAvDsiXpW0BTgsres7x8tAc75Ums64157Ccg9Q6/CRgM9ExN/sV5jNuV+pzf5iqCUu/5+3frmnYKPBPwHjJH2ktyCNu78XeCOwIyWEM4E3F7Y7QdJpafl84OG0vAs4qsJ+vgPMS+cejiCb/fO7Bxn7PwK/m+bZR9JkSVVvaBQRPwR2STo1Fc0vrK4Wt1nNnBRsxItsVsdzgf+eLkldD1xDdm7gK0C7pE6yXsPThU03AgvS0NKxZDeGAVgCfLP3RHNhP2vJ7ke8muyuWX8bEf82yHA/Kamr9xER95MNAT0i6Ungawz8xX4RsETSI2Q9hx+l8gfJTiwXTzSbDYpnSbWWlIZnvhERb290LIMl6ch0TgNJVwCTIuKSBodlo4THF81Gnv8h6Uqy/78/AD7c2HBsNHFPwczMcj6nYGZmOScFMzPLOSmYmVnOScHMzHJOCmZmlvv/vQ1VIs4Lm6QAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualize_word_length(dataset='test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(captions_len): 8175\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAcuElEQVR4nO3de5gcdZ3v8ffHAEEBhUhgQ5Il6EZdcDViRD26rogCso5Bj2iQ9UTkyPEsKhxvD7he180u6uqjHkWNouZZwZiDIgFvsBG8LYLhTghZotyyCckAIhc1QvycP+o3RWcyPdOZpKZ7Mp/X8/TT1b/6VfW3ay6frqruX8k2ERERAI/pdgEREdE7EgoREVFLKERERC2hEBERtYRCRETUEgoREVFLKETPkfQFSe/vdh1NkXSZpP/ZYd+/lrS66ZqGeN66RkknSLq4k74x/iUUoiOSXi9phaQHJa2X9H1JL9wB632jpJ+1ttl+i+2PbO+6h3iuD0n6+o5eb5PPafuntp+6I2saRQ3n2D6yyeeQ9DVJ/9Tkc0RnEgoxIknvAD4F/DOwP/DnwFnAvC6WFRENSCjEsCQ9AfhH4BTb37b9kO2HbV9o+92lz2GSLpd0X9mL+Kyk3VrWYUlvl/RrSXdL+rikx0j6S+ALwPPLHsh9pf8W7xolvVnSGkn3Slom6YBB636LpFsk/UbS5yRpFK/zeZL+o7yG6yS9uGXeZZI+Iunnkh6QdLGkfVvm/w9Jt0u6R9L7Jd0m6aWSjgbeC7yuvL7rWp7ywHbrG1TXiyWtbXl8m6R3Sbpe0m8lfVPS7kMsN7m8lqe3tE2V9HtJ+0naR9JFkvrLdrtI0ow2NWyxNyfpZZJuLs//WUAt8/5C0o/LvLslfbNl3tMkXVJ+jqslvba0nwycALynbKcLh6ojxkZCIUbyfGB34Pxh+mwG/g+wb+l/BPD3g/q8CpgLHEq1h/Em26uAtwCX297T9t6DVyzpJcC/AK8FpgG3A0sGdXsF8BzgmaXfUZ2/PJA0Hfgu8E/AFOBdwLckTW3p9nrgRGA/YLfSB0kHU+01nVDqewIwHcD2D6j2rr5ZXt8zR1pfh14LHA0cBDwDeOPgDrY3Ad8Gjh+03I9tb6T62/8qcCDVnt/vgc+O9MQlvL4FvI/q5/0r4AUtXT4CXAzsA8wA/m9Zbg/gEuBcqtd8PHCWpENsLwLOAT5WtlNfJxshmpFQiJE8Ebjb9iPtOti+yvYvbD9i+zbgi8DfDOr2Udv32r6D6lDU8XTmBOArtq8u/+jOoNqzmNXS50zb95V1XwrM6XDdA/4O+J7t79n+k+1LgBXAMS19vmr7P23/Hlja8hyvAS60/TPbfwQ+AHQyoFi79XXiM7bX2b4XuHCYZc9ly+38+tKG7Xtsf8v272w/ACxk65/ZUI4BbrJ9nu2HqX6Wd7XMf5gqaA6w/QfbA3sYrwBus/3V8ntyNVW4vKaD54wxlFCIkdwD7Ctpl3YdJD2lHH64S9L9VO+OBx8OubNl+nbgADpzQOkPgO0HS03TW/q0/lP6HbBnh+secCBwXDnccl85jPVCqnf+Iz3HAbS8Ntu/K/WNZHtq7nTZHwGPlfRcSQdShcf5AJIeJ+mL5bDX/cBPgL0lTRrhuQe/XrPlz/Y9VIeTrpS0UtKbSvuBwHMHbeMTgD8b+eXGWGr7hx5RXA78ATgWOK9Nn88D1wDH235A0mls/Q5wJrCyTP85sK5Mj/Sueh3VPxSgPgzxROC/Oiu/I3cC/2b7zaNYdj1QfzpI0mOp6hvQtWGIbf9J0lKqvYUNwEVlrwDgnVR1P9f2XZLmUP0MRzofs57qZwlAOX9TP7Z9F/DmMu+FwL9L+gnVNv6x7Ze1K3cbX140JHsKMSzbv6U6JPI5SceWd5i7Snq5pI+VbnsB9wMPSnoa8L+HWNW7y8nNmcCpwMAJyA3ADLWcmB7kXOBESXMkTabaC7miHKYajcdI2r3lNhn4
OtAn6ShJk0r7i9udeB3kvLLsfyuv4cNs+Y91AzBLUrf+1s4FXkf1rvzclva9qM4j3CdpCvDBDtf3XeAQSa8ue49vp+XdvqTjWrbbb6j+2W8GLgKeIukN5fdnV0nPUfVhA6i205NG9xJjR0ooxIhsfxJ4B9XJxX6qd31vBb5TuryL6nj1A8CXePQffqsLgKuAa6n+sZxd2n9EtQdxl6S7h3ju5cD7qY4/rweeDMzfjpdzPNU/w4Hbr2zfSXXy+70tr+/ddPD3YXsl8Daqk9/rqbbBRmBT6fL/yv09kq7ejrpHxfYVwENUh32+3zLrU8BjgbuBXwA/6HB9dwPHAWdSHSabDfy8pctzgCskPQgsA061fWvZQzmS6me3juoQ2EeByWW5s4GDy6Gl72zzC40dRrnITjRNkoHZttd0u5amSdoTuI/q9d7a5XIitln2FCK2k6S+clhtD+BfgRuA27pbVcToJBQitt88qkMi66gOp8x3dsFjnMrho4iIqGVPISIiauP6ewr77ruvZ82a1e0yIiLGlauuuupu21OHmjeuQ2HWrFmsWLGi22VERIwrkm5vNy+HjyIiopZQiIiIWkIhIiJqCYWIiKglFCIiopZQiIiIWkIhIiJqjYWCpKdKurbldr+k0yRNKRfvvqXc79OyzBmqLtC+WtI2XWc3IiK2X2OhYHu17Tm25wDPprps4PnA6cBy27OB5eXxwAXQ5wOHUF2U/KwOLg0YERE70FgdPjqC6mImt1ONKLm4tC+muswjpX2J7U1lHPo1wGFjVN9W+vr66Ovr69bTR0R0xViFwnzgG2V6f9vrAcr9fqV9OlteAHwtW16cHQBJJ0taIWlFf39/gyVHREw8jYdCuW7tK3n0soRtuw7RttW43rYX2Z5re+7UqUOO5xQREaM0FnsKLweutr2hPN4gaRpAud9Y2tcCM1uWm0F10ZKIiBgjYxEKx/PooSOoLua9oEwvoLqg+0D7fEmTJR1EdQWrK8egvoiIKBodOlvS44CXAf+rpflMYKmkk4A7gOMAbK+UtBS4CXgEOMX25ibri4iILTW6p2D7d7afaPu3LW332D7C9uxyf2/LvIW2n2z7qba/32RtnconkCJiIsk3miMiopZQGEL2DiJiokooRERELaEQERG1hEJERNQSChERUUsoRERELaEQERG1hEJERNQSChERUUsoRERELaEQERG1hEJERNQSChERUUsoRERELaEQERG1hEJERNQSChERUUsoRERELaEQERG1RkNB0t6SzpN0s6RVkp4vaYqkSyTdUu73ael/hqQ1klZLOqrJ2iIiYmtN7yl8GviB7acBzwRWAacDy23PBpaXx0g6GJgPHAIcDZwlaVLD9UVERIvGQkHS44EXAWcD2P6j7fuAecDi0m0xcGyZngcssb3J9q3AGuCwpuobjb6+vm6XEBHRqCb3FJ4E9ANflXSNpC9L2gPY3/Z6gHK/X+k/HbizZfm1pW0Lkk6WtELSiv7+/gbLj4iYeJoMhV2AQ4HP234W8BDlUFEbGqLNWzXYi2zPtT136tSpO6bSiIgAmg2FtcBa21eUx+dRhcQGSdMAyv3Glv4zW5afAaxrsL6IiBiksVCwfRdwp6SnlqYjgJuAZcCC0rYAuKBMLwPmS5os6SBgNnBlU/VFRMTWdml4/W8DzpG0G/Br4ESqIFoq6STgDuA4ANsrJS2lCo5HgFNsb264voiIaNFoKNi+Fpg7xKwj2vRfCCxssqaIiGgv32iOiIhaQiEiImoJhYiIqCUUIiKillCIiIhaQmEb9fX1ZQykiNhpJRQiIqKWUIiIiFpCISIiagmFiIioJRQiIqKWUIiIiFpCISIiagmFiIioJRQiIqKWUIiIiFpCISIiagmFiIioJRRGKYPiRcTOKKEQERG1hEJERNQaDQVJt0m6QdK1klaUtimSLpF0S7nfp6X/GZLWSFot6agma4uIiK2NxZ7C4bbn2J5bHp8OLLc9G1heHiPpYGA+
cAhwNHCWpEljUF9ERBTdOHw0D1hcphcDx7a0L7G9yfatwBrgsLEvLyJi4mo6FAxcLOkqSSeXtv1trwco9/uV9unAnS3Lri1tW5B0sqQVklb09/c3WHpExMSzS8Prf4HtdZL2Ay6RdPMwfTVEm7dqsBcBiwDmzp271fyIiBi9RvcUbK8r9xuB86kOB22QNA2g3G8s3dcCM1sWnwGsa7K+iIjYUmOhIGkPSXsNTANHAjcCy4AFpdsC4IIyvQyYL2mypIOA2cCVTdUXERFba/Lw0f7A+ZIGnudc2z+Q9EtgqaSTgDuA4wBsr5S0FLgJeAQ4xfbmBuuLiIhBGgsF278GnjlE+z3AEW2WWQgsbKqmiIgYXr7RHBERtYRCRETUEgoREVFLKERERC2hEBERtYRCkYvmREQkFCIiokVCISIiagmFiIioJRQiIqKWUIiIiFpCISIiagmFiIioJRQiIqKWUIiIiFpCISIiagmFiIioJRQiIqKWUIiIiFpCISIiao2HgqRJkq6RdFF5PEXSJZJuKff7tPQ9Q9IaSaslHdV0bRERsaWx2FM4FVjV8vh0YLnt2cDy8hhJBwPzgUOAo4GzJE0ag/oiIqJoNBQkzQD+FvhyS/M8YHGZXgwc29K+xPYm27cCa4DDmqwvIiK21PSewqeA9wB/amnb3/Z6gHK/X2mfDtzZ0m9taduCpJMlrZC0or+/v5GiIyImql066STpIOBtwKzWZWy/cphlXgFstH2VpBd38jRDtHmrBnsRsAhg7ty5W82PiIjR6ygUgO8AZwMXsuW7/uG8AHilpGOA3YHHS/o6sEHSNNvrJU0DNpb+a4GZLcvPANZ1+FwREbEDdHr46A+2P2P7Uts/HrgNt4DtM2zPsD2L6gTyj2z/HbAMWFC6LQAuKNPLgPmSJpc9k9nAldv6giIiYvQ63VP4tKQPAhcDmwYabV89iuc8E1gq6STgDuC4sq6VkpYCNwGPAKfY3jyK9UdExCh1Ggp/BbwBeAmPHj5yeTwi25cBl5Xpe4Aj2vRbCCzssKaIiNjBOg2FVwFPsv3HJouJiIju6vScwnXA3g3WERERPaDTUNgfuFnSDyUtG7g1Wdh40tfX1+0SIiJ2iE4PH32w0SoiIqIndBQKI338NCIidg6dfqP5AR79dvFuwK7AQ7Yf31RhEREx9jrdU9ir9bGkY8lgdRERO51RDYhn+zt0+B2FiIgYPzo9fPTqloePAeYyxGB1ERExvnX66aPWz1w+AtxGdf2DiIjYiXR6TuHEpguJiIjuGzYUJH1gmNm2/ZEdXE9ERHTRSHsKDw3RtgdwEvBEIKEQEbETGTYUbH9iYFrSXsCpwInAEuAT7ZaLiIjxacRzCpKmAO8ATgAWA4fa/k3ThUVExNgb6ZzCx4FXU10T+a9sPzgmVY1DA4PiXXjhhV2uJCJi9Eb68to7gQOA9wHrJN1fbg9Iur/58iIiYiyNdE5hVN943tlkaOyImCjyTz8iImoJhYiIqDUWCpJ2l3SlpOskrZT04dI+RdIlkm4p9/u0LHOGpDWSVks6qqnaIiJiaE3uKWwCXmL7mcAc4GhJzwNOB5bbng0sL4+RdDAwHzgEOBo4S9KkBuuLiIhBGgsFVwY+wrpruZlqIL3FpX0xcGyZngcssb3J9q3AGnLNhoiIMdXpKKmjUt7pXwX8BfA521dI2t/2egDb6yXtV7pPB37Rsvja0jZm8imjiJjoGj3RbHuz7TnADOAwSU8fpruGWsVWnaSTJa2QtKK/v38HVRoRETBGnz6yfR9wGdW5gg2SpgGU+42l21pgZstiM4B1Q6xrke25tudOnTq1ybIjIiacJj99NFXS3mX6scBLgZuBZcCC0m0BcEGZXgbMlzRZ0kHAbODKpuqLiIitNXlOYRqwuJxXeAyw1PZFki4Hlko6CbgDOA7A9kpJS4GbqK7udortzQ3WFxERgzQWCravB541RPs9wBFtllkILGyqpoiIGF6+0RwREbWEQkRE
1BIKERFRSyhEREQtoRAREbWEQkRE1Bod+2g8yHhHERGPyp7CDpaQiYjxLKEQERG1hEJERNQSCg3KoaSIGG8SChERUUsoRERELaEQERG1hEJERNQSChERUZvw32jeHvl0UUTsbLKnEBERtYRCRETUEgoREVFLKERERK2xUJA0U9KlklZJWinp1NI+RdIlkm4p9/u0LHOGpDWSVks6qqnaIiJiaE3uKTwCvNP2XwLPA06RdDBwOrDc9mxgeXlMmTcfOAQ4GjhL0qQG64uIiEEaCwXb621fXaYfAFYB04F5wOLSbTFwbJmeByyxvcn2rcAa4LCm6ouIiK2NyTkFSbOAZwFXAPvbXg9VcAD7lW7TgTtbFltb2gav62RJKySt6O/vb7TuiIiJpvFQkLQn8C3gNNv3D9d1iDZv1WAvsj3X9typU6fuqDIjIoKGQ0HSrlSBcI7tb5fmDZKmlfnTgI2lfS0ws2XxGcC6JuuLiIgtNfnpIwFnA6tsf7Jl1jJgQZleAFzQ0j5f0mRJBwGzgSubqm+s9PX1ZTiMiBg3mhz76AXAG4AbJF1b2t4LnAkslXQScAdwHIDtlZKWAjdRfXLpFNubG6wvIiIGaSwUbP+Moc8TABzRZpmFwMKmaoqIiOHlG80REVFLKERERC2hEBERtYRCRETUEgoREVFLKERERC2hMIbyRbaI6HUJhYiIqCUUIiKillCIiIhaQiEiImoJhYiIqCUUIiKillCIiIhaQiEiImpNXmSn5zX1RbJ8QS0ixqvsKURERC2hEBERtYRCF+UwU0T0mgl9TqFbEgYR0asa21OQ9BVJGyXd2NI2RdIlkm4p9/u0zDtD0hpJqyUd1VRdERHRXpOHj74GHD2o7XRgue3ZwPLyGEkHA/OBQ8oyZ0ma1GBtERExhMZCwfZPgHsHNc8DFpfpxcCxLe1LbG+yfSuwBjisqdoiImJoY32ieX/b6wHK/X6lfTpwZ0u/taVtK5JOlrRC0or+/v5Gi42ImGh65dNHGqLNQ3W0vcj2XNtzp06d2nBZO85IJ5dz8jkiesFYh8IGSdMAyv3G0r4WmNnSbwawboxri4iY8MY6FJYBC8r0AuCClvb5kiZLOgiYDVw5xrVFREx4jX1PQdI3gBcD+0paC3wQOBNYKukk4A7gOADbKyUtBW4CHgFOsb25qdoiImJojYWC7ePbzDqiTf+FwMKm6omIiJH1yonmiIjoAQmFiIioJRQiIqKWUIiIiFpCoYf09fXlS2wR0VUJhYiIqCUUIiKillCIiIhaQqHH5RxDRIylhEJERNQSCj0oewcR0S0JhXEkYRERTUsoRERErbFRUqMznbz7zx5CRIyV7Cn0qJG+3ZygiIgmJBQiIqKWUBjHMlZSROxoCYVxJoeVIqJJOdG8k0gYRMSOkD2FnVAOK0XEaCUUdmIJhojYVj13+EjS0cCngUnAl22f2eWSet62nmO48MILmywnIsaxngoFSZOAzwEvA9YCv5S0zPZN3a1s59IuRAbConV+a4D09fUlUCJ2cj0VCsBhwBrbvwaQtASYByQUumRwELTb8xhoHzw9eF1DtXc6f6Q6h6orIrZNr4XCdODOlsdrgee2dpB0MnByefigpNWjeJ59gbtHVWF3jEm9krapfaj5ZXpf4O4dsb7t0eF68rvQrNTbrNHWe2C7Gb0WCkP9FXuLB/YiYNF2PYm0wvbc7VnHWEq9zRlPtULqbVrq7b1PH60FZrY8ngGs61ItERETTq+Fwi+B2ZIOkrQbMB9Y1uWaIiImjJ46fGT7EUlvBX5I9ZHUr9he2cBTbdfhpy5Ivc0ZT7VC6m3ahK9XtkfuFRERE0KvHT6KiIguSihERERtQoWCpKMlrZa0RtLp3a5nKJJuk3SDpGslrShtUyRdIumWcr9PF+v7iqSNkm5saWtbn6QzyvZeLemoHqn3Q5L+q2zjayUd0wv1Spop6VJJqyStlHRqae/J7TtMvb26fXeXdKWk60q9Hy7tvbp9
29Xb7Pa1PSFuVCeufwU8CdgNuA44uNt1DVHnbcC+g9o+Bpxepk8HPtrF+l4EHArcOFJ9wMFlO08GDirbf1IP1Psh4F1D9O1qvcA04NAyvRfwn6Wmnty+w9Tbq9tXwJ5lelfgCuB5Pbx929Xb6PadSHsK9RAatv8IDAyhMR7MAxaX6cXAsd0qxPZPgHsHNberbx6wxPYm27cCa6h+DmOmTb3tdLVe2+ttX12mHwBWUX3Lvye37zD1ttPtem37wfJw13Izvbt929Xbzg6pdyKFwlBDaAz3C9wtBi6WdFUZ0gNgf9vrofpDBPbrWnVDa1dfL2/zt0q6vhxeGjhc0DP1SpoFPIvq3WHPb99B9UKPbl9JkyRdC2wELrHd09u3Tb3Q4PadSKEw4hAaPeIFtg8FXg6cIulF3S5oO/TqNv888GRgDrAe+ERp74l6Je0JfAs4zfb9w3Udoq0X6u3Z7Wt7s+05VKMlHCbp6cN079V6G92+EykUxsUQGrbXlfuNwPlUu38bJE0DKPcbu1fhkNrV15Pb3PaG8sf2J+BLPLqL3fV6Je1K9Q/2HNvfLs09u32HqreXt+8A2/cBlwFH08Pbd0BrvU1v34kUCj0/hIakPSTtNTANHAncSFXngtJtAXBBdypsq119y4D5kiZLOgiYDVzZhfq2MPAPoHgV1TaGLtcrScDZwCrbn2yZ1ZPbt129Pbx9p0rau0w/FngpcDO9u32HrLfx7TtWZ9J74QYcQ/UJiV8B/9Dteoao70lUnx64Dlg5UCPwRGA5cEu5n9LFGr9Btcv6MNU7k5OGqw/4h7K9VwMv75F6/w24Abi+/CFN64V6gRdS7e5fD1xbbsf06vYdpt5e3b7PAK4pdd0IfKC09+r2bVdvo9s3w1xERERtIh0+ioiIESQUIiKillCIiIhaQiEiImoJhYiIqCUUYqcg6c8kLZH0K0k3SfqepKeMcl2nSXpcy+PvDXxefDtr/JCkd23veoZZ/5xBI2Y2+nyxc0ooxLhXvkR1PnCZ7SfbPhh4L7D/KFd5GlCHgu1jXH2jtNfNofqeQMSoJRRiZ3A48LDtLww02L7W9k8l7SlpuaSrVV2nYh5UA7hJulnS4jKw2HmSHifp7cABwKWSLi19b5O0b5l+h6Qby+20lnWtkvSlMu79xeUbqB2R9G5Jvyx1DIyZ33adkp5T+l4u6eOllt2AfwRep2qM/deV1R8s6TJJvy6vLWJYCYXYGTwduKrNvD8Ar3I1yODhwCfKngXAU4FFtp8B3A/8ve3PUI0Xc7jtw1tXJOnZwInAc6nGtX+zpGeV2bOBz9k+BLgP+O+dFC7pyLLsYVTv9J/dMghiu3V+FXiL7ecDmwFcDQf/AeCbtufY/mbp+zTgqLL+D5axiiLaSijEzk7AP0u6Hvh3qqGEBw4r3Wn752X661TDNgznhcD5th9yNc79t4G/LvNutX1tmb4KmNVhfUeW2zXA1VT/xGe3W2c5t7GX7f8o7eeOsP7vuhpf/26qgd5Ge0gtJohdul1AxA6wEnhNm3knAFOBZ9t+WNJtwO5l3uAxXkYa82WooYkHbGqZ3gx0evhIwL/Y/uIWjdX1CYZa53A1dFJX/uZjWNlTiJ3Bj4DJkt480FCOu/8N8ARgYwmEw4EDW5b7c0nPL9PHAz8r0w9QXV5ysJ8Ax5ZzD3tQjVD50+2s/YfAm8o1CZA0XVLbiyjZ/g3wgKTnlab5LbPb1R3RsYRCjHuuRnV8FfCy8pHUlVTXsV0HnAPMlbSCaq/h5pZFVwELyqGlKVQXLwFYBHx/4ERzy/NcDXyNajjiK4Av275mG8t9n6S1AzfbF1MdArpc0g3AeYz8j/0kYJGky6n2HH5b2i+lOrHceqI5YptklNSYkMrhmYtsD3flrZ4kac9yTgNJp1MNnXxql8uKnUSOL0aMP38r6Qyqv9/bgTd2t5zYmWRPISIiajmnEBERtYRCRETUEgoREVFLKERERC2hEBERtf8PMzp/Or4T
OWYAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualize_word_length(dataset='valid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len(captions_len): 65419\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAbQUlEQVR4nO3de7xldV3/8dfb4SoXARkIZ7Ahmy5AhTIRZhfREtQmsF/U2IXJqHlEVPozK7DyklF00V/x+ClFaoyZ4fw0AkxKQqksEgfkNlxiFIRpJmbEiMESYfz8/ljfE9vDOWftueyzz5l5PR+P/dhrf/d3rfXZ65yz32d919prp6qQJGkmTxt3AZKkuc+wkCT1MiwkSb0MC0lSL8NCktTLsJAk9TIsNG8k+aMkvz7uOkYlyXVJfmrIvt+Z5O5R1zTD+p+d5NEkC8ZVg2aXYaGdkuRHkqxtbxybklyd5Dt2wXJ/IsnHB9uq6meq6i07u+wp1vWmJO/d1csd5Tqr6h+r6ut3cN1P2bY7sP77q+rAqtq2M8uZya6oU7uOYaEdluS1wB8AvwUcCTwbeAdw+hjL0i7gHoOeoqq8edvuG/AM4FHgzBn6nARcDzwMbAL+L7DPwPMF/ALwGeBzwO/R/QPzjcAXgW1tHQ+3/pcCvzkw/08D64HPA1cCz5q07J8B7gH+A3g7kGnqfBPw3mmeOxn45/YabgFeOPDcdcBbgH8CtgIfAQ4feP4s4LPAQ8CvA/cB3wOcBnwJeLy9vluGWd6kul4IbBh4fB/wOuBW4D+B9wP7TTHfTNv2YuDDwBdanS8HPgU8AjwAvGlgOUvaNt6rr3ZgP+C9bTs8DHwSOHLg9+hd7ffj34DfBBZMV6e38d3cs9COej7dm8DlM/TZBvxv4PDW/8XAz07q8wpgGfA8uj2Sn6yqO+ne6K+vbqjjkMkLTvIi4LeBHwKOontTvmxSt+8DvhX4ltbv1OFfHiRZBPw13RvYYXRvxh9MsnCg248ArwKOAPZpfUhyLN1e1o+2+p4BLAKoqr+h2xt7f3t939K3vCH9EF0QHQN8M/ATkzv0bNsfAS4ADgI+ThcaZwGH0AXHOUnOmGH909W+ku71Hw08s63/v9tzq4EngK8Fngu8BPipYX4HNLsMC+2oZwKfq6onputQVTdW1b9U1RNVdR/wx8B3T+r2O1X1+aq6n25I65VDrv9HgXdX1U1V9RhwPvD8JEsG+lxYVQ+3ZX8MOGHIZU/4MeDDVfXhqvpyVV0DrAVeNtDnT6vqX6vqv4E1A+v4QeCqqvp4VX0JeAPdf+J9plveMC6qqo1V9Xngqu2cF+CKqvqn9lq/WFXXVdVt7fGtwF/w1J/fMLU/Tvf78rVVta39XjyS5EjgpcBrquoLVbUZ+D/Aiu2sW7Ngr3EXoHnrIeDwJHtNFxhJvg54G92ew9Ppft9unNTtgYHpzwLPGnL9zwJumnhQVY8meYjuv/f7WvO/D/T/L+DAIZc94auBM5MsH2jbmy54Jky3jmcx8Nqq6r9afX12pubJ8w67LScM/ixI8m3AhcDxdHsK+wL/bzvWP1H7n9HtVVyW5BC6Ialfpdu+ewObkkzM97TJdWhucM9CO+p6ujHlM2boczFwF7C0qg4GXg9kUp+jB6afDWxs033/hW+ke7MBIMkBdP+9/ltf4dvhAeDPquqQgdsBVXXhEPNuAhYP1Ld/q2/COC/3PN26J7e/j+5Y0NFV9Qzgj3jqz69/ZVWPV9Wbq+pY4NvphgfPotu+j9Ed25jYvgdX1XE9dWoMDAvtkKr6T7qhlbcnOSPJ05PsneSlSX63dTuI7uDoo0m+AThnikX9UpJDkxwNvJruwCzAg8DiJPtMU8L7gFclOSHJvnTHAD7Rhrt2xNOS7Ddw25fuP+DlSU5NsqC1vzDJ4r6FAR9o8357ew1v5ivfaB8EliQZx99g37adcBDw+ar6YpKT6I5JbLckpyT5pnaG1SN0
w1LbqmoT3YHwtyY5OMnTkjwnycRQ17B1ahYYFtphVfU24LXArwFb6P5T/Dngr1qX19G9wWwF/oQng2DQFXRDUzfTHUx+V2v/KLAO+Pckn5ti3dfSnWH0Qbr/4p/Dzo11v5LuoOvE7dNV9QDdQffXD7y+X2KIv5uqWgf8PN1B901022Az3X/S8ORwzkNJbnrqEkZqxm074GeB30iyle4fgzU7uL6vogvPR4A7gb+nC2Lo9jD2Ae6gO2vtA3QnBGxPnZoFqXJPT+ORpOiGqNaPu5ZRS3Ig3WmjS6vq3jGXI2039yykEUmyvA3PHQD8PnAbTx58l+YVw0IandPpDsRvBJYCK8pdec1TDkNJknq5ZyFJ6rXbfijv8MMPryVLloy7DEmaV2688cbPVdXCye27bVgsWbKEtWvXjrsMSZpXknx2qnaHoSRJvQwLSVIvw0KS1MuwkCT1MiwkSb0MC0lSL8NCktTLsJAk9TIsJEm9DIsdsHz58v5OkrQbMSwkSb0MC0lSL8NCktTLsJAk9dptL1E+Ch7YlrSncs9CktTLsJAk9TIsJEm9DAtJUi/DQpLUy7CQJPUyLCRJvQwLSVIvw0KS1Muw2EHLly/3E92S9hhe7mMIhoKkPZ17FpKkXoaFJKnXSMMiyX1Jbktyc5K1re2wJNckuafdHzrQ//wk65PcneTUgfYT23LWJ7koSUZZtyTpK83GnsUpVXVCVS1rj88Drq2qpcC17TFJjgVWAMcBpwHvSLKgzXMxsApY2m6nzULdkqRmHMNQpwOr2/Rq4IyB9suq6rGquhdYD5yU5Cjg4Kq6vqoKeM/APJKkWTDqsCjgI0luTLKqtR1ZVZsA2v0RrX0R8MDAvBta26I2Pbn9KZKsSrI2ydotW7bswpchSXu2UZ86+4Kq2pjkCOCaJHfN0Heq4xA1Q/tTG6suAS4BWLZs2ZR9JEnbb6R7FlW1sd1vBi4HTgIebENLtPvNrfsG4OiB2RcDG1v74inaJUmzZGRhkeSAJAdNTAMvAW4HrgRWtm4rgSva9JXAiiT7JjmG7kD2DW2oamuSk9tZUGcNzCNJmgWjHIY6Eri8neW6F/C+qvqbJJ8E1iQ5G7gfOBOgqtYlWQPcATwBnFtV29qyzgEuBfYHrm43SdIsGVlYVNVngG+Zov0h4MXTzHMBcMEU7WuB43d1jZKk4fgJbklSL8NCktTLsNhJXpFW0p7AsJAk9TIsJEm9DAtJUi/DQpLUy7CQJPUyLCRJvQwLSVIvw0KS1MuwkCT1MiwkSb0MC0lSL8Oih9d+kiTDQpI0BMNCktTLsJAk9TIsJEm9DAtJUi/DQpLUy7CQJPUyLHaB5cuX+3kMSbs1w0KS1MuwkCT1MiwkSb0MC0lSL8NCktRr5GGRZEGSTyX5UHt8WJJrktzT7g8d6Ht+kvVJ7k5y6kD7iUlua89dlCSjrluS9KTZ2LN4NXDnwOPzgGurailwbXtMkmOBFcBxwGnAO5IsaPNcDKwClrbbabNQtySpGWlYJFkMvBx450Dz6cDqNr0aOGOg/bKqeqyq7gXWAyclOQo4uKqur6oC3jMwjyRpFox6z+IPgF8GvjzQdmRVbQJo90e09kXAAwP9NrS2RW16cvtTJFmVZG2StVu2bNklL0CSNMKwSPJ9wOaqunHYWaZoqxnan9pYdUlVLauqZQsXLhxytZKkPnuNcNkvAL4/ycuA/YCDk7wXeDDJUVW1qQ0xbW79NwBHD8y/GNjY2hdP0S5JmiUj27OoqvOranFVLaE7cP3Rqvox4EpgZeu2EriiTV8JrEiyb5Jj6A5k39CGqrYmObmdBXXWwDySpFkwyj2L6VwIrElyNnA/cCZAVa1Lsga4A3gCOLeqtrV5zgEuBfYHrm43SdIsmZWwqKrrgOva9EPAi6fpdwFwwRTta4HjR1ehJGkmfoJbktTLsJAk9TIsJEm9DAtJUi/DYhfyq1Ul7a4MC0lSL8NCktTLsJAk9TIsJEm9DAtJUi/D
QpLUy7CQJPUyLCRJvQwLSVIvw0KS1MuwkCT1MiwkSb0MC0lSL8NCktTLsJAk9TIsJEm9DAtJUi/DQpLUy7CQJPUyLCRJvfYaplOSY4CfB5YMzlNV3z+asuav5cuXA3DVVVeNuRJJ2nWGCgvgr4B3AVcBXx5ZNZKkOWnYsPhiVV000kokSXPWsMcs/jDJG5M8P8nzJm4zzZBkvyQ3JLklybokb27thyW5Jsk97f7QgXnOT7I+yd1JTh1oPzHJbe25i5Jkh16tJGmHDLtn8U3AjwMv4slhqGqPp/MY8KKqejTJ3sDHk1wN/ABwbVVdmOQ84DzgV5IcC6wAjgOeBfxdkq+rqm3AxcAq4F+ADwOnAVdvx+uUJO2EYcPiFcDXVNWXhl1wVRXwaHu4d7sVcDrwwta+GrgO+JXWfllVPQbcm2Q9cFKS+4CDq+p6gCTvAc7AsJCkWTPsMNQtwCHbu/AkC5LcDGwGrqmqTwBHVtUmgHZ/ROu+CHhgYPYNrW1Rm57cPtX6ViVZm2Ttli1btrdcSdI0ht2zOBK4K8kn6YaXgP5TZ9sQ0glJDgEuT3L8DN2nOg5RM7RPtb5LgEsAli1bNmUfSdL2GzYs3rgzK6mqh5NcR3es4cEkR1XVpiRH0e11QLfHcPTAbIuBja198RTtIzXxeQlJ0pDDUFX191PdZponycK2R0GS/YHvAe4CrgRWtm4rgSva9JXAiiT7tg8BLgVuaENVW5Oc3M6COmtgHknSLBj2E9xbeXLoZx+6g9VfqKqDZ5jtKGB1kgV0obSmqj6U5HpgTZKzgfuBMwGqal2SNcAdwBPAuW0YC+Ac4FJgf7oD2x7clqRZNFRYVNVBg4+TnAGc1DPPrcBzp2h/CHjxNPNcAFwwRftaYKbjHZKkEdqhCwlW1V8x82csJEm7kWGHoX5g4OHTgGVMc0aSJGn3M+zZUIOnBj0B3Ef3ITpJ0h5g2GMWrxp1IZKkuWvGsEjyhhmerqp6yy6uR5I0B/XtWXxhirYDgLOBZwKGhSTtAWYMi6p668R0koOAVwOvAi4D3jrdfJKk3UvvqbPt+yd+E7iVLlyeV1W/UlWbe2bdo3m5EEm7k75jFr9H9/0TlwDfVFWPztRfkrR76tuz+EW6LyL6NWBjkkfabWuSR0ZfniRpLug7ZrFDn/CWJO1eDANJUi/DQpLUy7CQJPUyLCRJvQwLSVIvw0KS1MuwkCT1MiwkSb0MC0lSL8NCktTLsJAk9TIsJEm9DIsRWr58ud9rIWm3YFhIknoZFpKkXoaFJKmXYSFJ6jWysEhydJKPJbkzybokr27thyW5Jsk97f7QgXnOT7I+yd1JTh1oPzHJbe25i5JkVHVLkp5qlHsWTwC/WFXfCJwMnJvkWOA84NqqWgpc2x7TnlsBHAecBrwjyYK2rIuBVcDSdjtthHVLkiYZWVhU1aaquqlNbwXuBBYBpwOrW7fVwBlt+nTgsqp6rKruBdYDJyU5Cji4qq6vqgLeMzCPJGkWzMoxiyRLgOcCnwCOrKpN0AUKcETrtgh4YGC2Da1tUZue3D7VelYlWZtk7ZYtW3bpa5CkPdnIwyLJgcAHgddU1SMzdZ2irWZof2pj1SVVtayqli1cuHD7i5UkTWmkYZFkb7qg+POq+svW/GAbWqLdb27tG4CjB2ZfDGxs7YunaJ83/BS3pPlulGdDBXgXcGdVvW3gqSuBlW16JXDFQPuKJPsmOYbuQPYNbahqa5KT2zLPGphHkjQL9hrhsl8A/DhwW5KbW9vrgQuBNUnOBu4HzgSoqnVJ1gB30J1JdW5VbWvznQNcCuwPXN1ukqRZMrKwqKqPM/XxBoAXTzPPBcAFU7SvBY7fddVJkraHn+CegscYJOkrGRaSpF6GhSSpl2EhSeplWEiSehkWkqRehoUkqZdhMUuWL1/uKbmS5i3DQpLUy7CQJPUyLCRJvQwLSVIvw0KS1MuwkCT1MixmmafPSpqPDAtJUi/DQpLUy7CQJPUyLCRJ
vQwLSVIvw0KS1MuwkCT1MizGwMuVS5pvDAtJUi/DQpLUy7CQJPUyLCRJvQwLSVKvkYVFkncn2Zzk9oG2w5Jck+Sedn/owHPnJ1mf5O4kpw60n5jktvbcRUkyqpolSVMb5Z7FpcBpk9rOA66tqqXAte0xSY4FVgDHtXnekWRBm+diYBWwtN0mL3Pe8vRZSfPFyMKiqv4B+Pyk5tOB1W16NXDGQPtlVfVYVd0LrAdOSnIUcHBVXV9VBbxnYB5J0iyZ7WMWR1bVJoB2f0RrXwQ8MNBvQ2tb1KYnt08pyaoka5Os3bJlyy4tXJL2ZHPlAPdUxyFqhvYpVdUlVbWsqpYtXLhwlxUnSXu62Q6LB9vQEu1+c2vfABw90G8xsLG1L56iXZI0i2Y7LK4EVrbplcAVA+0rkuyb5Bi6A9k3tKGqrUlObmdBnTUwjyRpluw1qgUn+QvghcDhSTYAbwQuBNYkORu4HzgToKrWJVkD3AE8AZxbVdvaos6hO7Nqf+DqdpMkzaKRhUVVvXKap148Tf8LgAumaF8LHL8LS5Mkbae5coB7j+XlyiXNB4aFJKmXYTFHuHchaS4zLCRJvQwLSVIvw0KS1MuwmKM8S0rSXGJYSJJ6GRaSpF4j+wS3tp/DTpLmKvcsJEm9DAtJUi/DQpLUy7CQJPUyLCRJvQwLSVIvw2KO83RaSXOBYSFJ6mVYSJJ6GRbzgBcVlDRuhoUkqZfXhppHBvcurrrqqjFWImlP456FJKmXYTFPeRxD0mxyGGqec2hK0mxwz2I34p6GpFExLCRJvRyG2s1Mt3fhEJWknTFvwiLJacAfAguAd1bVhWMuaV4xRCTtjHkRFkkWAG8HvhfYAHwyyZVVdcd4K5v/DBFJw5gXYQGcBKyvqs8AJLkMOB0wLEZkVxwsnwic5cuXTxk+nsklzR/zJSwWAQ8MPN4AfNvkTklWAavaw0eT3L2d6zkc+NwOVThac7Gu3pqSTDnd13eUNY2BNQ1vLta1J9b01VM1zpewmOqdpJ7SUHUJcMkOryRZW1XLdnT+UZmLdVnTcKxpeHOxLmt60nw5dXYDcPTA48XAxjHVIkl7nPkSFp8EliY5Jsk+wArgyjHXJEl7jHkxDFVVTyT5OeBv6U6dfXdVrRvBqnZ4CGvE5mJd1jQcaxreXKzLmppUPWXoX5KkrzBfhqEkSWNkWEiSehkWTZLTktydZH2S88ZYx31Jbktyc5K1re2wJNckuafdHzriGt6dZHOS2wfapq0hyfltu92d5NRZrutNSf6tba+bk7xstupKcnSSjyW5M8m6JK9u7WPdVjPUNc5ttV+SG5Lc0mp6c2sf27aaoaaxbaeB9SxI8qkkH2qPx/73R1Xt8Te6g+afBr4G2Ae4BTh2TLXcBxw+qe13gfPa9HnA74y4hu8Cngfc3lcDcGzbXvsCx7TtuGAW63oT8Lop+o68LuAo4Hlt+iDgX9t6x7qtZqhrnNsqwIFtem/gE8DJ49xWM9Q0tu00sK7XAu8DPtQej/3vzz2Lzv9cTqSqvgRMXE5krjgdWN2mVwNnjHJlVfUPwOeHrOF04LKqeqyq7gXW023P2aprOiOvq6o2VdVNbXorcCfd1QbGuq1mqGs6s7GtqqoebQ/3brdijNtqhpqmMys/vySLgZcD75y07rH+/RkWnakuJzLTH9coFfCRJDe2y5cAHFlVm6B7IwCOGENd09UwF7bdzyW5tQ1TTeyez2pdSZYAz6X773TObKtJdcEYt1UbWrkZ2AxcU1Vj31bT1ATj/Z36A+CXgS8PtI39d8qw6Ax1OZFZ8oKqeh7wUuDcJN81pjqGNe5tdzHwHOAEYBPw1tY+a3UlORD4IPCaqnpkpq6zVRNMWddYt1VVbauqE+iuwHBSkuNn6D7Omsa2nZJ8H7C5qm4cdpYp2kbyO2VYdObM5USqamO73wxcTrdL+WCSowDa/eYxlDZdDWPddlX1YPuD/zLwJzy5Cz4rdSXZm+4N+c+r6i9b89i3
1VR1jXtbTaiqh4HrgNOYA9tqck1j3k4vAL4/yX10w+EvSvJe5sB2Miw6c+JyIkkOSHLQxDTwEuD2VsvK1m0lcMVs1zZDDVcCK5Lsm+QYYClww2wVNfEH1LyCbnvNSl1JArwLuLOq3jbw1Fi31XR1jXlbLUxySJveH/ge4C7GuK2mq2mc26mqzq+qxVW1hO596KNV9WPMhb+/URw1n4834GV0Z418GvjVMdXwNXRnNtwCrJuoA3gmcC1wT7s/bMR1/AXd7vfjdP+5nD1TDcCvtu12N/DSWa7rz4DbgFvp/nCOmq26gO+g2+W/Fbi53V427m01Q13j3FbfDHyqrft24A19v9tjrGls22lSfS/kybOhxv735+U+JEm9HIaSJPUyLCRJvQwLSVIvw0KS1MuwkCT1Miy0W0vyVUkuS/LpJHck+XCSr9vBZb0mydMHHn944jz9nazxTUlet7PLmWH5J0y6cupI16fdk2Gh3Vb7cNrlwHVV9ZyqOhZ4PXDkDi7yNcD/hEVVvay6T/7OdSfQfc5C2mGGhXZnpwCPV9UfTTRU1c1V9Y9JDkxybZKb0n1/yOnQXXgvyV1JVrcLyX0gydOT/ALwLOBjST7W+t6X5PA2/dokt7fbawaWdWeSP0n3fQkfaZ8UHkqSX0ryyVbHxHctTLvMJN/a+l6f5PdaLfsAvwH8cLrvZvjhtvhjk1yX5DPttUkzMiy0OzsemO6CbF8EXlHdRRtPAd7a9kQAvh64pKq+GXgE+NmquojumjunVNUpgwtKciLwKuDb6L4P4aeTPLc9vRR4e1UdBzwM/K9hCk/ykjbvSXR7BicOXFRyumX+KfAzVfV8YBtAdZfcfwPw/qo6oare3/p+A3BqW/4b27WkpGkZFtpTBfitJLcCf0d3WeeJ4akHquqf2vR76S6fMZPvAC6vqi9U9/0Ifwl8Z3vu3qq6uU3fCCwZsr6XtNungJvo3tyXTrfMduzkoKr659b+vp7l/3V134HwObqL0u3o0Jz2EHuNuwBphNYBPzjNcz8KLAROrKrH21U+92vPTb4GTt81caa6TPSExwamtwHDDkMF+O2q+uOvaOy+n2KqZc5UwzB1+V6gGblnod3ZR4F9k/z0REMb1/9u4Bl03xvweJJTgK8emO/ZSZ7fpl8JfLxNb6X7mtLJ/gE4ox3bOIDuSqX/uJO1/y3wk+m+k4Iki5JM+6VXVfUfwNYkJ7emFQNPT1e3NDTDQrut6q6S+Qrge9ups+vovl95I/DnwLIka+n2Mu4amPVOYGUbojqM7stwAC4Brp44wD2wnpuAS+kuDf0J4J1V9antLPfXkmyYuFXVR+iGkq5PchvwAfrf8M8GLklyPd2exn+29o/RHdAePMAtbRevOisNaMM8H6qqmb7FbU5KcmA7ZkKS8+gurf3qMZel3YTjlNLu4+VJzqf7u/4s8BPjLUe7E/csJEm9PGYhSeplWEiSehkWkqRehoUkqZdhIUnq9f8B5geAXAusUjEAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "visualize_word_length(dataset='train')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
